<a href="https://colab.research.google.com/github/cconsta1/dion-bone-classification/blob/main/5fold_strattified_dion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Necessary Libraries

In [1]:
# Install Necessary Libraries
!pip install xgboost lightgbm openpyxl scikit-learn --quiet

# Import Required Libraries

In [2]:
# Import Libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.metrics import classification_report, accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder
import openpyxl
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

print("Libraries Imported!")


Libraries Imported!


# Load and Prepare the Dataset

In [3]:
# Define File Paths (Modify as Needed)
left_bones_path = "/content/drive/MyDrive/Dion-data/Cleaned_Left_Bones.csv"
right_bones_path = "/content/drive/MyDrive/Dion-data/Cleaned_Right_Bones.csv"

# Load Cleaned Datasets
df_mf_left_bones = pd.read_csv(left_bones_path)
df_mf_right_bones = pd.read_csv(right_bones_path)

print(f"Loaded Left Bones Dataset: {df_mf_left_bones.shape} | Right Bones Dataset: {df_mf_right_bones.shape}")

# Shuffle Datasets
df_mf_left_bones = df_mf_left_bones.sample(frac=1, random_state=42).reset_index(drop=True)
df_mf_right_bones = df_mf_right_bones.sample(frac=1, random_state=42).reset_index(drop=True)

# Encode Target Variable (Pelvis Sex)
df_mf_left_bones["Pelvis Sex"] = LabelEncoder().fit_transform(df_mf_left_bones["Pelvis Sex"])
df_mf_right_bones["Pelvis Sex"] = LabelEncoder().fit_transform(df_mf_right_bones["Pelvis Sex"])

print("Cleaned datasets loaded, shuffled, and encoded!")


Loaded Left Bones Dataset: (48, 38) | Right Bones Dataset: (48, 38)
Cleaned datasets loaded, shuffled, and encoded!


# Generate and Save Descriptive Statistics


In [None]:
# Define file names and corresponding DataFrame selections
tables_info = [
    ("Descriptive_Statistics_Left_Bones_Females.xlsx", df_mf_left_bones[df_mf_left_bones["Pelvis Sex"] == 0]),
    ("Descriptive_Statistics_Left_Bones_Males.xlsx", df_mf_left_bones[df_mf_left_bones["Pelvis Sex"] == 1]),
    ("Descriptive_Statistics_Right_Bones_Females.xlsx", df_mf_right_bones[df_mf_right_bones["Pelvis Sex"] == 0]),
    ("Descriptive_Statistics_Right_Bones_Males.xlsx", df_mf_right_bones[df_mf_right_bones["Pelvis Sex"] == 1])
]

# Loop through each case and save the descriptive statistics
for file_name, df in tables_info:
    stats = df.describe().transpose()
    stats.to_excel(file_name)
    print(f"Saved: {file_name}")


Saved: Descriptive_Statistics_Left_Bones_Females.xlsx
Saved: Descriptive_Statistics_Left_Bones_Males.xlsx
Saved: Descriptive_Statistics_Right_Bones_Females.xlsx
Saved: Descriptive_Statistics_Right_Bones_Males.xlsx


In [None]:
print("Missing Values Per Column - Left Bones Dataset:")
print(df_mf_left_bones.isna().sum())

print("\n Missing Values Per Column - Right Bones Dataset:")
print(df_mf_right_bones.isna().sum())


Missing Values Per Column - Left Bones Dataset:
Pelvis Sex                                            0
Clavicle maximum length                              31
Clavicle sagittal diameter at midshaft               30
Clavicle vertical diameter at midshaft               30
Scapula height                                       42
Scapula breadth                                      42
Humerus maximum length                               25
Humerus epicondylar breadth                          23
Humerus vertical diameter of head                    22
Humerus maximum diameter at midshaft                 25
Humerus minimum diameter at midshaft                 25
Radius maximum length                                26
Radius sagittal diameter at midshaft                 26
Radius transverse diameter at midshaft               26
Ulna maximum length                                  40
Ulna dorso-volar diameter                            22
Ulna transverse diameter                             22


# Imputation: Handling Missing Values

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, IterativeImputer

# Define Imputation Methods
knn_imputer = KNNImputer(n_neighbors=3)
iter_imputer = IterativeImputer(max_iter=1000, random_state=42)

# Identify independent variables (exclude "Pelvis Sex")
independent_vars = [col for col in df_mf_left_bones.columns if col != "Pelvis Sex"]

# Create dictionary for datasets
datasets = {
    "Original_L": df_mf_left_bones.copy(),
    "Original_R": df_mf_right_bones.copy(),
    "KNN_L": df_mf_left_bones.copy(),
    "KNN_R": df_mf_right_bones.copy(),
    "Iterative_L": df_mf_left_bones.copy(),
    "Iterative_R": df_mf_right_bones.copy(),
}

# Apply KNN & Iterative Imputation on the Original Datasets
datasets["KNN_L"][independent_vars] = knn_imputer.fit_transform(datasets["KNN_L"][independent_vars])
datasets["KNN_R"][independent_vars] = knn_imputer.fit_transform(datasets["KNN_R"][independent_vars])
datasets["Iterative_L"][independent_vars] = iter_imputer.fit_transform(datasets["Iterative_L"][independent_vars])
datasets["Iterative_R"][independent_vars] = iter_imputer.fit_transform(datasets["Iterative_R"][independent_vars])

print("\n Imputation Completed!")
print(f" Available datasets: {list(datasets.keys())}")
print("\n Missing Values After Imputation (Left - KNN):")
print(datasets["KNN_L"].isna().sum())
print("\n Missing Values After Imputation (Right - Iterative):")
print(datasets["Iterative_R"].isna().sum())



 Imputation Completed!
 Available datasets: ['Original_L', 'Original_R', 'KNN_L', 'KNN_R', 'Iterative_L', 'Iterative_R']

 Missing Values After Imputation (Left - KNN):
Pelvis Sex                                           0
Clavicle maximum length                              0
Clavicle sagittal diameter at midshaft               0
Clavicle vertical diameter at midshaft               0
Scapula height                                       0
Scapula breadth                                      0
Humerus maximum length                               0
Humerus epicondylar breadth                          0
Humerus vertical diameter of head                    0
Humerus maximum diameter at midshaft                 0
Humerus minimum diameter at midshaft                 0
Radius maximum length                                0
Radius sagittal diameter at midshaft                 0
Radius transverse diameter at midshaft               0
Ulna maximum length                                  0
Ulna 

# Wilcoxon Tests for Sex-Based Differences

In [None]:
import scipy.stats as stats

# Define output file paths
wilcoxon_results_file = "Wilcoxon_Test_Results.csv"
wilcoxon_results_by_bone_file = "Wilcoxon_Test_Results_By_Bone.csv"

# Initialize results storage
results = []
results_by_bone = {}

# Loop through all datasets and perform Wilcoxon test
for dataset_name, df in datasets.items():
    for var in independent_vars:
        # Split data into Male (1) and Female (0) groups
        males = df[df["Pelvis Sex"] == 1][var].dropna()
        females = df[df["Pelvis Sex"] == 0][var].dropna()

        num_males, num_females = len(males), len(females)

        # Ensure both groups have enough samples for Wilcoxon test
        if num_males > 1 and num_females > 1:
            try:
                stat, p_value = stats.mannwhitneyu(males, females, alternative="two-sided")
            except ValueError as e:
                print(f" Error for {var} in {dataset_name}: {e}")
                stat, p_value = None, None
        else:
            print(f" Skipping {var} in {dataset_name} (Insufficient samples)")
            stat, p_value = None, None

        # Store results in standard format (by dataset)
        results.append({
            "Dataset": dataset_name,
            "Variable": var,
            "Wilcoxon Statistic": stat,
            "p-value": p_value,
            "Num Males": num_males,
            "Num Females": num_females
        })

        # Store results in grouped format (by bone)
        if var not in results_by_bone:
            results_by_bone[var] = []
        results_by_bone[var].append({
            "Dataset": dataset_name,
            "Wilcoxon Statistic": stat,
            "p-value": p_value,
            "Num Males": num_males,
            "Num Females": num_females
        })

# Convert standard results to DataFrame and save
df_results = pd.DataFrame(results)
df_results.to_csv(wilcoxon_results_file, index=False)

print("\n Wilcoxon test results saved!")
print("\n Wilcoxon Test Results (Sample):")
print(df_results.head())

# Convert grouped results (by bone) to DataFrame
grouped_results = []
for var, data in results_by_bone.items():
    row = {"Variable": var}
    for entry in data:
        row[f"{entry['Dataset']} Wilcoxon Statistic"] = entry["Wilcoxon Statistic"]
        row[f"{entry['Dataset']} p-value"] = entry["p-value"]
        row[f"{entry['Dataset']} Num Males"] = entry["Num Males"]
        row[f"{entry['Dataset']} Num Females"] = entry["Num Females"]
    grouped_results.append(row)

df_results_by_bone = pd.DataFrame(grouped_results)

# Save the grouped results
df_results_by_bone.to_csv(wilcoxon_results_by_bone_file, index=False)

print("\n Wilcoxon test results grouped by bone saved!")
print("\n Sample of Wilcoxon results grouped by bone:")
print(df_results_by_bone.head())


 Skipping Scapula height in Original_R (Insufficient samples)
 Skipping Scapula breadth in Original_R (Insufficient samples)

 Wilcoxon test results saved!

 Wilcoxon Test Results (Sample):
      Dataset                                Variable  Wilcoxon Statistic  \
0  Original_L                 Clavicle maximum length                64.0   
1  Original_L  Clavicle sagittal diameter at midshaft                62.0   
2  Original_L  Clavicle vertical diameter at midshaft                73.0   
3  Original_L                          Scapula height                 8.0   
4  Original_L                         Scapula breadth                 8.0   

    p-value  Num Males  Num Females  
0  0.008024          8            9  
1  0.055711          8           10  
2  0.003862          8           10  
3  0.133333          4            2  
4  0.133333          4            2  

 Wilcoxon test results grouped by bone saved!

 Sample of Wilcoxon results grouped by bone:
                          

# Filter Wilcoxon Test Results (p > 0.04)

In [None]:
import pandas as pd

# Define the file path (Adjust if necessary)
file_path = "/content/Wilcoxon_Test_Results_By_Bone.csv"

# Load the Wilcoxon test results file
df = pd.read_csv(file_path)

# Create an empty list to store filtered results
filtered_results = []

# Iterate through the DataFrame and filter for p-values > 0.04
for index, row in df.iterrows():
    for col in df.columns:
        if "p-value" in col and row[col] > 0.04:
            variable_name = row["Variable"]
            dataset_used = col.replace(" p-value", "")
            filtered_results.append({"Variable": variable_name, "Dataset": dataset_used, "p-value": row[col]})

# Convert the filtered results into a DataFrame and sort alphabetically
df_filtered = pd.DataFrame(filtered_results).sort_values(by=["Variable", "Dataset"])

# Display the DataFrame
print(df_filtered)

# Optionally, save the filtered results to a new CSV file
df_filtered.to_csv("/content/Filtered_Wilcoxon_Results.csv", index=False)
print("Filtered results saved to /content/Filtered_Wilcoxon_Results.csv")


                                   Variable      Dataset   p-value
1                   Clavicle maximum length  Iterative_R  0.117525
0                   Clavicle maximum length   Original_R  0.051282
3    Clavicle sagittal diameter at midshaft  Iterative_R  0.062365
2    Clavicle sagittal diameter at midshaft   Original_L  0.055711
21  Femur sagittal subtrochanteric diameter  Iterative_L  0.088399
20  Femur sagittal subtrochanteric diameter   Original_L  0.073416
27      Fibula maximum diameter at midshaft  Iterative_L  0.436982
28      Fibula maximum diameter at midshaft  Iterative_R  0.044143
25      Fibula maximum diameter at midshaft   Original_L  0.797203
26      Fibula maximum diameter at midshaft   Original_R  0.301029
24                    Fibula maximum length  Iterative_L  0.048853
22                    Fibula maximum length   Original_L  0.606394
23                    Fibula maximum length   Original_R  0.211933
8      Humerus minimum diameter at midshaft   Original_L  0.04

# Define and Initialize Machine Learning Models

In [5]:
# Define Classifiers
classifiers = {
    "XGBoost": xgb.XGBClassifier(
        eval_metric="logloss",
        random_state=42,
        n_estimators=200,         # more trees (default is 100)
        learning_rate=0.05,       # slower but more precise learning
        max_depth=4,              # slightly deeper than default (3)
        subsample=0.9,            # helps generalization
        colsample_bytree=0.9,     # use 90% of features per tree
        reg_alpha=0.1,            # L1 regularization
        reg_lambda=1.0,           # L2 regularization
        scale_pos_weight=1.66     # to balance 30 males vs. 18 females
    ),

    "LightGBM": lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=4,
        min_child_samples=10,
        reg_alpha=0.1,
        reg_lambda=0.1,
        class_weight='balanced',
        random_state=42,
        verbose=-1
    ),

    "RandomForest": RandomForestClassifier(
        random_state=42
    ),

    "LogReg": LogisticRegression(
        max_iter=1000,
        random_state=42
    )
}

print("Models Defined!")



Models Defined!


# Set Up Cross-Validation Strategy

In [6]:
def get_cv_strategy(y):
    """
    Returns the appropriate cross-validation strategy:
    - LOOCV if Male/Female count < 5
    - 5-Fold Stratified CV otherwise
    """
    min_class_size = y.value_counts().min()

    if min_class_size < 5:
        return LeaveOneOut(), "Stratified LOOCV"
    else:
        return StratifiedKFold(n_splits=min(5, min_class_size), shuffle=True, random_state=42), "5-Fold Stratified CV"

print(" Cross-Validation Strategies Defined!")


 Cross-Validation Strategies Defined!


In [7]:
def run_classification(X, y, variable_name, model_name, dataset_name, cv_strategy, cv_type, results_df):
    model = classifiers[model_name]
    y_true, y_pred = [], []

    for train_idx, test_idx in cv_strategy.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if len(np.unique(y_train)) < 2:
            print(f" Skipping {variable_name} ({model_name}) - Only one class present in training data!")
            return results_df

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        y_true.extend(y_test)
        y_pred.extend(preds)

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    new_result = pd.DataFrame([{
        "Variable": variable_name,
        "Model": model_name,
        "Imputation": dataset_name,
        "CV Strategy": cv_type,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision M": report["0"]["precision"],
        "Precision F": report["1"]["precision"],
        "Recall M": report["0"]["recall"],
        "Recall F": report["1"]["recall"],
        "F1-score M": report["0"]["f1-score"],
        "F1-score F": report["1"]["f1-score"],
        "Sample size M": report["0"]["support"],
        "Sample size F": report["1"]["support"]
    }])

    results_df = pd.concat([results_df, new_result], ignore_index=True)

    print(f" Recorded results for {variable_name} using {model_name} ({cv_type})")
    return results_df



# Train and Evaluate Models for Individual Bone Measurements


In [None]:
import matplotlib.pyplot as plt
import os

# Define Folder for Plots
plots_folder = "/content/drive/MyDrive/Dion-results/Overfitting_Plots"
os.makedirs(plots_folder, exist_ok=True)

# Generate Timestamp for File Naming
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"/content/drive/MyDrive/Dion-results/Sex_Classification_{timestamp}.csv"

# Initialize Results DataFrame
results_df = pd.DataFrame(columns=["Variable", "Model", "Imputation", "CV Strategy", "Accuracy",
                                   "Precision M", "Precision F", "Recall M", "Recall F",
                                   "F1-score M", "F1-score F", "Sample size M", "Sample size F"])

# Debug: Ensure datasets exist
if not datasets:
    print(" ERROR: 'datasets' dictionary is empty! Check dataset loading.")

# Loop Over Each Dataset
for dataset_name, df in datasets.items():
    y = df["Pelvis Sex"]

    for variable in df.drop(columns=["Pelvis Sex"]).columns:
        X = df[[variable]]
        cv_strategy, cv_type = get_cv_strategy(y)

        for model_name in classifiers.keys():
            # Boosting classifiers run on all datasets, RF & LogReg only on KNN & Iterative
            if model_name in ["XGBoost", "LightGBM"] or dataset_name in ["KNN_L", "KNN_R", "Iterative_L", "Iterative_R"]:

                # Debug: Ensure the function is called
                print(f" Running classification for: {variable} ({model_name}, {dataset_name})")

                # Run classification and collect results
                results_df = run_classification(X, y, variable, model_name, dataset_name, cv_strategy, cv_type, results_df)

                # Debug: Check if results_df is being updated
                if results_df.empty:
                    print(f" ERROR: results_df is empty after processing {variable} ({model_name}, {dataset_name})")

                # Generate overfitting plots
                train_scores, test_scores = [], []

                for train_idx, test_idx in cv_strategy.split(X, y):
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                    if len(np.unique(y_train)) < 2:
                        continue  # Skip iterations with only one class in training data

                    model = classifiers[model_name]
                    model.fit(X_train, y_train)

                    train_scores.append(model.score(X_train, y_train))
                    test_scores.append(model.score(X_test, y_test))

                # Plot Training vs. Test Scores
                plt.figure(figsize=(8, 6))
                plt.plot(range(len(train_scores)), train_scores, label="Train Score", marker="o", linestyle="-")
                plt.plot(range(len(test_scores)), test_scores, label="Test Score", marker="o", linestyle="--")
                plt.xlabel("Fold Number")
                plt.ylabel("Accuracy")
                plt.title(f"Overfitting Analysis - {variable} ({model_name}, {dataset_name})")
                plt.legend()
                plt.grid()

                # Save Plot
                plot_filename = f"{plots_folder}/{variable}_{model_name}_{dataset_name}.png"
                plt.savefig(plot_filename)
                plt.close()

# Debug: Ensure DataFrame is not empty before saving
if results_df.empty:
    print(" ERROR: results_df is empty! No results to save.")
else:
    # Save Results
    results_df.to_csv(results_file, index=False)
    print(f" Individual Variable Classification Results saved to {results_file}!")

# Debug: Verify File Exists
if os.path.exists(results_file):
    print(f" CSV file successfully created at: {results_file}")
else:
    print(" ERROR: CSV file was not created!")


 Running classification for: Clavicle maximum length (XGBoost, Original_L)
 Recorded results for Clavicle maximum length using XGBoost (5-Fold Stratified CV)


  results_df = pd.concat([results_df, new_result], ignore_index=True)


 Running classification for: Clavicle maximum length (LightGBM, Original_L)
 Recorded results for Clavicle maximum length using LightGBM (5-Fold Stratified CV)
 Running classification for: Clavicle sagittal diameter at midshaft (XGBoost, Original_L)
 Recorded results for Clavicle sagittal diameter at midshaft using XGBoost (5-Fold Stratified CV)
 Running classification for: Clavicle sagittal diameter at midshaft (LightGBM, Original_L)
 Recorded results for Clavicle sagittal diameter at midshaft using LightGBM (5-Fold Stratified CV)
 Running classification for: Clavicle vertical diameter at midshaft (XGBoost, Original_L)
 Recorded results for Clavicle vertical diameter at midshaft using XGBoost (5-Fold Stratified CV)
 Running classification for: Clavicle vertical diameter at midshaft (LightGBM, Original_L)
 Recorded results for Clavicle vertical diameter at midshaft using LightGBM (5-Fold Stratified CV)
 Running classification for: Scapula height (XGBoost, Original_L)
 Recorded results 

# Train and Evaluate Models for Bone Groups

In [None]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Define Bone Groups
bone_groups = {
    "Clavicle": ["Clavicle maximum length", "Clavicle sagittal diameter at midshaft", "Clavicle vertical diameter at midshaft"],
    "Scapula": ["Scapula height", "Scapula breadth"],
    "Humerus": ["Humerus maximum length", "Humerus epicondylar breadth", "Humerus vertical diameter of head",
                "Humerus maximum diameter at midshaft", "Humerus minimum diameter at midshaft"],
    "Radius": ["Radius maximum length", "Radius sagittal diameter at midshaft", "Radius transverse diameter at midshaft"],
    "Ulna": ["Ulna maximum length", "Ulna dorso-volar diameter", "Ulna transverse diameter",
             "Ulna physiological length", "Ulna minimum circumference"],
    "Femur": ["Femur maximum heigth", "Femur bicondylar length", "Femur epicondylar breadth",
              "Femur maximum head diameter", "Femur sagittal subtrochanteric diameter",
              "Femur transverse subtrochanteric diameter", "Femur sagittal midshaft diameter",
              "Femur transverse midshaft diameter", "Femur midshaft circumference"],
    "Tibia": ["Tibia length", "Tibia maximum proximal epiphyseal breadth", "Tibia maximum distal epiphyseal breadth",
              "Tibia maximum diameter at the nutrient foramen", "Tibia transverse diameter at the nutrient foramen",
              "Tibia circumference at the nutrient foramen"],
    "Fibula": ["Fibula maximum length", "Fibula maximum diameter at midshaft"],
    "Calcaneus": ["Calcaneus maximum length", "Calcaneus middle breadth"]
}

# Define Folder for Bone Group Plots
group_plots_folder = "/content/drive/MyDrive/Dion-results/Overfitting_Plots_BoneGroups"
os.makedirs(group_plots_folder, exist_ok=True)

# Generate Timestamp for File Naming
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
group_results_file = f"/content/drive/MyDrive/Dion-results/Bone_Groups_{timestamp}.csv"

# Initialize Results DataFrame

group_results_df = pd.DataFrame(columns=["Variable", "Model", "Imputation", "CV Strategy", "Accuracy",
                                   "Precision M", "Precision F", "Recall M", "Recall F",
                                   "F1-score M", "F1-score F", "Sample size M", "Sample size F"])

# Loop Over Each Dataset
for dataset_name, df in datasets.items():
    y = df["Pelvis Sex"]

    for group_name, variables in bone_groups.items():
        # Ensure only available variables are used
        valid_vars = [var for var in variables if var in df.columns]
        if not valid_vars:
            print(f" Skipping {group_name} ({dataset_name}) - No valid data!")
            continue

        X_group = df[valid_vars]  # DO NOT DROP NaNs, keep missing values

        cv_strategy, cv_type = get_cv_strategy(y)

        for model_name in classifiers.keys():
            # Ensure ALL datasets are used for Boosting Classifiers (XGBoost, LightGBM)
            if model_name in ["XGBoost", "LightGBM"]:
                print(f" Running {model_name} on {group_name} ({dataset_name}) - Keeping NaNs")

                # Run classification without dropping NaNs
                group_results_df = run_classification(X_group, y, group_name, model_name, dataset_name, cv_strategy, cv_type, group_results_df)

            # Keep regular processing for RandomForest and Logistic Regression
            elif dataset_name in ["KNN_L", "KNN_R", "Iterative_L", "Iterative_R"]:
                X_group_imputed = X_group.dropna()  # Drop NaNs only for these methods
                y_imputed = y.loc[X_group_imputed.index]

                if len(X_group_imputed) == 0:
                    print(f" Skipping {group_name} ({dataset_name}) - No valid samples after dropping NaNs!")
                    continue

                print(f" Running {model_name} on {group_name} ({dataset_name}) - Dropping NaNs")
                group_results_df = run_classification(X_group_imputed, y_imputed, group_name, model_name, dataset_name, cv_strategy, cv_type, group_results_df)

                # Generate Overfitting Analysis Plots
                train_scores, test_scores = [], []

                for train_idx, test_idx in cv_strategy.split(X_group_imputed, y_imputed):
                    X_train, X_test = X_group_imputed.iloc[train_idx], X_group_imputed.iloc[test_idx]
                    y_train, y_test = y_imputed.iloc[train_idx], y_imputed.iloc[test_idx]

                    if len(np.unique(y_train)) < 2:
                        continue  # Skip iterations with only one class in training data

                    model = classifiers[model_name]
                    model.fit(X_train, y_train)

                    train_scores.append(model.score(X_train, y_train))
                    test_scores.append(model.score(X_test, y_test))

                # Plot Training vs. Test Scores
                plt.figure(figsize=(8, 6))
                plt.plot(range(len(train_scores)), train_scores, label="Train Score", marker="o", linestyle="-")
                plt.plot(range(len(test_scores)), test_scores, label="Test Score", marker="o", linestyle="--")
                plt.xlabel("Fold Number")
                plt.ylabel("Accuracy")
                plt.title(f"Overfitting Analysis - {group_name} ({model_name}, {dataset_name})")
                plt.legend()
                plt.grid()

                # Save Plot
                plot_filename = f"{group_plots_folder}/{group_name}_{model_name}_{dataset_name}.png"
                plt.savefig(plot_filename)
                plt.close()

# Save Results
group_results_df.to_csv(group_results_file, index=False)
print(f" Bone Group Classification Results saved to {group_results_file}!")
print(f" Overfitting Analysis Plots for Bone Groups saved in {group_plots_folder}!")


 Running XGBoost on Clavicle (Original_L) - Keeping NaNs


  results_df = pd.concat([results_df, new_result], ignore_index=True)


 Recorded results for Clavicle using XGBoost (5-Fold Stratified CV)
 Running LightGBM on Clavicle (Original_L) - Keeping NaNs
 Recorded results for Clavicle using LightGBM (5-Fold Stratified CV)
 Running XGBoost on Scapula (Original_L) - Keeping NaNs
 Recorded results for Scapula using XGBoost (5-Fold Stratified CV)
 Running LightGBM on Scapula (Original_L) - Keeping NaNs
 Recorded results for Scapula using LightGBM (5-Fold Stratified CV)
 Running XGBoost on Humerus (Original_L) - Keeping NaNs
 Recorded results for Humerus using XGBoost (5-Fold Stratified CV)
 Running LightGBM on Humerus (Original_L) - Keeping NaNs
 Recorded results for Humerus using LightGBM (5-Fold Stratified CV)
 Running XGBoost on Radius (Original_L) - Keeping NaNs
 Recorded results for Radius using XGBoost (5-Fold Stratified CV)
 Running LightGBM on Radius (Original_L) - Keeping NaNs
 Recorded results for Radius using LightGBM (5-Fold Stratified CV)
 Running XGBoost on Ulna (Original_L) - Keeping NaNs
 Recorded re

# Identify the Best Models for Individual Variables

In [None]:
import pandas as pd

# Define file path for Sex Classification results

# Copy the output file from above
sex_classification_file = "/content/drive/MyDrive/Dion-results/Sex_Classification_20250327_125312.csv"

# Load the CSV file
df_sex = pd.read_csv(sex_classification_file)

# Extract unique variable names (remove L/R labels)
df_sex["Base Variable"] = df_sex["Variable"]

# Identify the best classifier for each variable (Left and Right separately)
top_models = []
for var in df_sex["Base Variable"].unique():
    df_var = df_sex[df_sex["Base Variable"] == var]

    # Best model for Left (L)
    df_L = df_var[df_var["Imputation"].str.contains("_L")]
    if not df_L.empty:
        top_L = df_L.loc[df_L["Accuracy"].idxmax()]
        top_models.append(top_L)

    # Best model for Right (R)
    df_R = df_var[df_var["Imputation"].str.contains("_R")]
    if not df_R.empty:
        top_R = df_R.loc[df_R["Accuracy"].idxmax()]
        top_models.append(top_R)

# Create DataFrame with the top classifiers
df_top_sex = pd.DataFrame(top_models)

# Sort to keep Left & Right variables together
df_top_sex = df_top_sex.sort_values(by=["Base Variable", "Imputation"])

# Drop unnecessary columns
df_top_sex = df_top_sex.drop(columns=["Base Variable"])

# Format floating-point numbers to two decimal places
float_cols = ["Accuracy", "Precision M", "Precision F", "Recall M", "Recall F", "F1-score M", "F1-score F"]
df_top_sex[float_cols] = df_top_sex[float_cols].applymap(lambda x: f"{x:.2f}")

# Display results
print(" Top classifiers for individual variables:")
print(df_top_sex)

# Save results
df_top_sex.to_csv("/content/Top_Individual_Variables.csv", index=False)

print(" Processed and saved top classifiers for individual variables.")

# Generate LaTeX table
latex_individual = df_top_sex.to_latex(index=False, column_format="lccccccccc",
                                       caption="Top Classifiers for Individual Variables (Left & Right)",
                                       label="tab:top_individual")

# Save LaTeX table
with open("/content/Top_Individual_Variables.tex", "w") as f:
    f.write(latex_individual)

print(" LaTeX table generated! Download 'Top_Individual_Variables.tex' and insert into Overleaf.")


 Top classifiers for individual variables:
                       Variable         Model   Imputation  \
584    Calcaneus maximum length       XGBoost  Iterative_L   
439    Calcaneus maximum length        LogReg        KNN_R   
295    Calcaneus middle breadth        LogReg        KNN_L   
147    Calcaneus middle breadth      LightGBM   Original_R   
151     Clavicle maximum length        LogReg        KNN_L   
..                          ...           ...          ...   
219  Ulna minimum circumference        LogReg        KNN_L   
509   Ulna physiological length      LightGBM  Iterative_L   
362   Ulna physiological length  RandomForest        KNN_R   
505    Ulna transverse diameter      LightGBM  Iterative_L   
359    Ulna transverse diameter        LogReg        KNN_R   

              CV Strategy Accuracy Precision M Precision F Recall M Recall F  \
584  5-Fold Stratified CV     0.92        0.91        0.94     0.97     0.83   
439  5-Fold Stratified CV     0.88        0.90      

  df_top_sex[float_cols] = df_top_sex[float_cols].applymap(lambda x: f"{x:.2f}")


# Identify the Best Models for Bone Groups

In [None]:
import pandas as pd

# Define file path for Bone Group Classification results

# Copy output file from above
bone_groups_file = "/content/drive/MyDrive/Dion-results/Bone_Groups_20250327_130942.csv"

# Load the CSV file
df_bone = pd.read_csv(bone_groups_file)

# Extract unique bone group names
df_bone["Base Variable"] = df_bone["Variable"]

# Identify the best classifier for each bone group (Left and Right separately)
top_models = []
for var in df_bone["Base Variable"].unique():
    df_var = df_bone[df_bone["Base Variable"] == var]

    # Best model for Left (L)
    df_L = df_var[df_var["Imputation"].str.contains("_L")]
    if not df_L.empty:
        top_L = df_L.loc[df_L["Accuracy"].idxmax()]
        top_models.append(top_L)

    # Best model for Right (R)
    df_R = df_var[df_var["Imputation"].str.contains("_R")]
    if not df_R.empty:
        top_R = df_R.loc[df_R["Accuracy"].idxmax()]
        top_models.append(top_R)

# Create DataFrame with the top classifiers
df_top_bone = pd.DataFrame(top_models)

# Sort to keep Left & Right bone groups together
df_top_bone = df_top_bone.sort_values(by=["Base Variable", "Imputation"])

# Drop unnecessary columns
df_top_bone = df_top_bone.drop(columns=["Base Variable"])

# Format floating-point numbers to two decimal places
df_top_bone[float_cols] = df_top_bone[float_cols].applymap(lambda x: f"{x:.2f}")

# Display results
print(" Top classifiers for bone groups:")
print(df_top_bone)

# Save results
df_top_bone.to_csv("/content/Top_Bone_Groups.csv", index=False)

print(" Processed and saved top classifiers for bone groups.")

# Generate LaTeX table
latex_bone = df_top_bone.to_latex(index=False, column_format="lccccccccc",
                                  caption="Top Classifiers for Bone Groups (Left & Right)",
                                  label="tab:top_bone_groups")

# Save LaTeX table
with open("/content/Top_Bone_Groups.tex", "w") as f:
    f.write(latex_bone)

print(" LaTeX table generated! Download 'Top_Bone_Groups.tex' and insert into Overleaf.")


 Top classifiers for bone groups:
      Variable         Model   Imputation           CV Strategy Accuracy  \
140  Calcaneus       XGBoost  Iterative_L  5-Fold Stratified CV     0.92   
107  Calcaneus        LogReg        KNN_R  5-Fold Stratified CV     0.90   
111   Clavicle        LogReg  Iterative_L  5-Fold Stratified CV     0.88   
146   Clavicle  RandomForest  Iterative_R  5-Fold Stratified CV     0.94   
56       Femur       XGBoost        KNN_L  5-Fold Stratified CV     0.98   
93       Femur      LightGBM        KNN_R  5-Fold Stratified CV     0.94   
175     Fibula        LogReg  Iterative_R  5-Fold Stratified CV     0.79   
67      Fibula        LogReg        KNN_L  5-Fold Stratified CV     0.77   
119    Humerus        LogReg  Iterative_L  5-Fold Stratified CV     0.92   
82     Humerus  RandomForest        KNN_R  5-Fold Stratified CV     0.92   
156     Radius       XGBoost  Iterative_R  5-Fold Stratified CV     0.94   
49      Radius      LightGBM        KNN_L  5-Fold Stra

  df_top_bone[float_cols] = df_top_bone[float_cols].applymap(lambda x: f"{x:.2f}")


# Train Logistic Regression Models on Bone Groups and Export to Text

In [None]:
# Train and Export Logistic Regression Models for Bone Groups

import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Output folder
logreg_model_folder = "/content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups"
os.makedirs(logreg_model_folder, exist_ok=True)

# Use only imputed datasets
datasets_to_use = ["KNN_L", "KNN_R", "Iterative_L", "Iterative_R"]

# Bone group definitions
bone_groups = {
    "Clavicle": ["Clavicle maximum length", "Clavicle sagittal diameter at midshaft", "Clavicle vertical diameter at midshaft"],
    "Scapula": ["Scapula height", "Scapula breadth"],
    "Humerus": ["Humerus maximum length", "Humerus epicondylar breadth", "Humerus vertical diameter of head",
                "Humerus maximum diameter at midshaft", "Humerus minimum diameter at midshaft"],
    "Radius": ["Radius maximum length", "Radius sagittal diameter at midshaft", "Radius transverse diameter at midshaft"],
    "Ulna": ["Ulna maximum length", "Ulna dorso-volar diameter", "Ulna transverse diameter",
             "Ulna physiological length", "Ulna minimum circumference"],
    "Femur": ["Femur maximum heigth", "Femur bicondylar length", "Femur epicondylar breadth",
              "Femur maximum head diameter", "Femur sagittal subtrochanteric diameter",
              "Femur transverse subtrochanteric diameter", "Femur sagittal midshaft diameter",
              "Femur transverse midshaft diameter", "Femur midshaft circumference"],
    "Tibia": ["Tibia length", "Tibia maximum proximal epiphyseal breadth", "Tibia maximum distal epiphyseal breadth",
              "Tibia maximum diameter at the nutrient foramen", "Tibia transverse diameter at the nutrient foramen",
              "Tibia circumference at the nutrient foramen"],
    "Fibula": ["Fibula maximum length", "Fibula maximum diameter at midshaft"],
    "Calcaneus": ["Calcaneus maximum length", "Calcaneus middle breadth"]
}

# Loop through datasets and bone groups
for dataset_name, df in datasets.items():
    if dataset_name not in datasets_to_use:
        continue

    y = df["Pelvis Sex"]

    for group_name, features in bone_groups.items():
        valid_features = [f for f in features if f in df.columns]
        if not valid_features:
            print(f"Skipping {group_name} ({dataset_name}) - No valid features")
            continue

        X = df[valid_features].dropna()
        y_subset = y.loc[X.index]

        if len(X) < 2 or len(np.unique(y_subset)) < 2:
            print(f"Skipping {group_name} ({dataset_name}) - Not enough data or only one class")
            continue

        # Train on all available data
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X, y_subset)

        intercept = model.intercept_[0]
        coefs = model.coef_[0]

        file_path = os.path.join(logreg_model_folder, f"LogisticRegression_{group_name}_{dataset_name}.txt")
        with open(file_path, "w") as f:
            f.write(f"Logistic Regression Model for {group_name} ({dataset_name})\n\n")
            f.write("Binary classification: 0 = Female, 1 = Male\n")
            f.write(f"Samples used: {len(X)}\n\n")

            f.write(f"Intercept: {intercept:.6f}\n\n")
            f.write("Coefficients:\n")
            for feature, coef in zip(valid_features, coefs):
                f.write(f"  {feature}: {coef:.6f}\n")

            f.write("\nFeature Order:\n")
            f.write(", ".join(valid_features) + "\n\n")

            f.write("Classification Rule:\n")
            f.write("Compute the score using your measurements:\n\n")
            formula = f"{intercept:.3f} + " + " + ".join([f"{coef:.3f} × {feat}" for coef, feat in zip(coefs, valid_features)])
            f.write(f"  Score = {formula}\n\n")
            f.write("Then apply the rule:\n")
            f.write("  If Score > 0 → Classify as Male\n")
            f.write("  If Score ≤ 0 → Classify as Female\n\n")

            f.write("Example (replace values with your measurements):\n")
            example = f"{intercept:.3f}"
            for coef, feat in zip(coefs, valid_features):
                example += f" + ({coef:.3f} × [your {feat}])"
            f.write(f"  Score = {example}\n")

        print(f"Exported: {file_path}")


Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Clavicle_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Scapula_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Humerus_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Radius_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Ulna_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Femur_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Tibia_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Fibula_KNN_L.txt
Exported: /content/drive/MyDrive/Dion-results/Exported_LogReg_BoneGroups/LogisticRegression_Calcaneus_KNN_L.txt
Exported: /conte

# Train and Export Boosted Models for Bone Groups (as Pickle Files)

In [11]:
# Train and Export XGBoost, LightGBM, and RandomForest Models for Bone Groups

import os
import joblib
import numpy as np
import pandas as pd

# Output folder
boosted_model_folder = "/content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels"
os.makedirs(boosted_model_folder, exist_ok=True)

# Datasets
datasets_rf = ["KNN_L", "KNN_R", "Iterative_L", "Iterative_R"]
datasets_boosting = ["Original_L", "Original_R", "KNN_L", "KNN_R", "Iterative_L", "Iterative_R"]

# Bone group definitions
bone_groups = {
    "Clavicle": ["Clavicle maximum length", "Clavicle sagittal diameter at midshaft", "Clavicle vertical diameter at midshaft"],
    "Scapula": ["Scapula height", "Scapula breadth"],
    "Humerus": ["Humerus maximum length", "Humerus epicondylar breadth", "Humerus vertical diameter of head",
                "Humerus maximum diameter at midshaft", "Humerus minimum diameter at midshaft"],
    "Radius": ["Radius maximum length", "Radius sagittal diameter at midshaft", "Radius transverse diameter at midshaft"],
    "Ulna": ["Ulna maximum length", "Ulna dorso-volar diameter", "Ulna transverse diameter",
             "Ulna physiological length", "Ulna minimum circumference"],
    "Femur": ["Femur maximum heigth", "Femur bicondylar length", "Femur epicondylar breadth",
              "Femur maximum head diameter", "Femur sagittal subtrochanteric diameter",
              "Femur transverse subtrochanteric diameter", "Femur sagittal midshaft diameter",
              "Femur transverse midshaft diameter", "Femur midshaft circumference"],
    "Tibia": ["Tibia length", "Tibia maximum proximal epiphyseal breadth", "Tibia maximum distal epiphyseal breadth",
              "Tibia maximum diameter at the nutrient foramen", "Tibia transverse diameter at the nutrient foramen",
              "Tibia circumference at the nutrient foramen"],
    "Fibula": ["Fibula maximum length", "Fibula maximum diameter at midshaft"],
    "Calcaneus": ["Calcaneus maximum length", "Calcaneus middle breadth"]
}

# Loop through datasets and train/export models
for dataset_name, df in datasets.items():
    y = df["Pelvis Sex"]

    for group_name, features in bone_groups.items():
        valid_features = [f for f in features if f in df.columns]
        if not valid_features:
            continue

        X = df[valid_features].dropna()
        y_subset = y.loc[X.index]

        if len(X) < 2 or len(np.unique(y_subset)) < 2:
            continue

        for model_name in ["XGBoost", "LightGBM", "RandomForest"]:
            if model_name == "RandomForest" and dataset_name not in datasets_rf:
                continue
            if model_name in ["XGBoost", "LightGBM"] and dataset_name not in datasets_boosting:
                continue

            model = classifiers[model_name]
            model.fit(X, y_subset)

            filename = f"{model_name}_{group_name}_{dataset_name}.pkl"
            file_path = os.path.join(boosted_model_folder, filename)
            joblib.dump(model, file_path)
            print(f"Exported: {file_path}")


Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/XGBoost_Clavicle_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/LightGBM_Clavicle_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/XGBoost_Scapula_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/LightGBM_Scapula_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/XGBoost_Humerus_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/LightGBM_Humerus_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/XGBoost_Radius_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/LightGBM_Radius_Original_L.pkl
Exported: /content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels/XGBoost_Ulna_Original_L.pkl
Exported: /content/drive/My

# Load and Test Pickled XGBoost, LightGBM, and RandomForest Models


In [15]:
import os
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Path to the pickled models
model_dir = "/content/drive/MyDrive/Dion-results/Exported_Pickled_BoostedModels"

# Load datasets again (already preprocessed in earlier cells)
left_bones_path = "/content/drive/MyDrive/Dion-data/Cleaned_Left_Bones.csv"
right_bones_path = "/content/drive/MyDrive/Dion-data/Cleaned_Right_Bones.csv"
df_mf_left_bones = pd.read_csv(left_bones_path)
df_mf_right_bones = pd.read_csv(right_bones_path)

df_mf_left_bones = df_mf_left_bones.sample(frac=1, random_state=42).reset_index(drop=True)
df_mf_right_bones = df_mf_right_bones.sample(frac=1, random_state=42).reset_index(drop=True)
df_mf_left_bones["Pelvis Sex"] = LabelEncoder().fit_transform(df_mf_left_bones["Pelvis Sex"])
df_mf_right_bones["Pelvis Sex"] = LabelEncoder().fit_transform(df_mf_right_bones["Pelvis Sex"])

# Load imputed datasets (KNN & Iterative)
from sklearn.impute import KNNImputer, IterativeImputer
independent_vars = [col for col in df_mf_left_bones.columns if col != "Pelvis Sex"]
knn_imputer = KNNImputer(n_neighbors=3)
iter_imputer = IterativeImputer(max_iter=1000, random_state=42)

datasets = {
    "Original_L": df_mf_left_bones.copy(),
    "Original_R": df_mf_right_bones.copy(),
    "KNN_L": df_mf_left_bones.copy(),
    "KNN_R": df_mf_right_bones.copy(),
    "Iterative_L": df_mf_left_bones.copy(),
    "Iterative_R": df_mf_right_bones.copy(),
}
datasets["KNN_L"][independent_vars] = knn_imputer.fit_transform(datasets["KNN_L"][independent_vars])
datasets["KNN_R"][independent_vars] = knn_imputer.fit_transform(datasets["KNN_R"][independent_vars])
datasets["Iterative_L"][independent_vars] = iter_imputer.fit_transform(datasets["Iterative_L"][independent_vars])
datasets["Iterative_R"][independent_vars] = iter_imputer.fit_transform(datasets["Iterative_R"][independent_vars])

# Bone group definitions
bone_groups = {
    "Clavicle": ["Clavicle maximum length", "Clavicle sagittal diameter at midshaft", "Clavicle vertical diameter at midshaft"],
    "Scapula": ["Scapula height", "Scapula breadth"],
    "Humerus": ["Humerus maximum length", "Humerus epicondylar breadth", "Humerus vertical diameter of head",
                "Humerus maximum diameter at midshaft", "Humerus minimum diameter at midshaft"],
    "Radius": ["Radius maximum length", "Radius sagittal diameter at midshaft", "Radius transverse diameter at midshaft"],
    "Ulna": ["Ulna maximum length", "Ulna dorso-volar diameter", "Ulna transverse diameter",
             "Ulna physiological length", "Ulna minimum circumference"],
    "Femur": ["Femur maximum heigth", "Femur bicondylar length", "Femur epicondylar breadth",
              "Femur maximum head diameter", "Femur sagittal subtrochanteric diameter",
              "Femur transverse subtrochanteric diameter", "Femur sagittal midshaft diameter",
              "Femur transverse midshaft diameter", "Femur midshaft circumference"],
    "Tibia": ["Tibia length", "Tibia maximum proximal epiphyseal breadth", "Tibia maximum distal epiphyseal breadth",
              "Tibia maximum diameter at the nutrient foramen", "Tibia transverse diameter at the nutrient foramen",
              "Tibia circumference at the nutrient foramen"],
    "Fibula": ["Fibula maximum length", "Fibula maximum diameter at midshaft"],
    "Calcaneus": ["Calcaneus maximum length", "Calcaneus middle breadth"]
}

# Evaluate all models
results = []

for filename in os.listdir(model_dir):
    if not filename.endswith(".pkl"):
        continue

    model_path = os.path.join(model_dir, filename)
    model = joblib.load(model_path)

    try:
        model_name, bone_group, dataset_name = filename.replace(".pkl", "").split("_", 2)
    except ValueError:
        print(f"Skipping invalid filename: {filename}")
        continue

    if dataset_name not in datasets:
        print(f"Dataset {dataset_name} not found for model {filename}")
        continue

    df = datasets[dataset_name]
    if bone_group not in bone_groups:
        print(f"Bone group {bone_group} not in defined groups for {filename}")
        continue

    features = [f for f in bone_groups[bone_group] if f in df.columns]
    if not features:
        print(f"No valid features in {filename}")
        continue

    X = df[features].dropna()
    y = df.loc[X.index, "Pelvis Sex"]

    if len(X) < 2 or len(np.unique(y)) < 2:
        continue

    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    results.append({
        "Model": model_name,
        "Bone Group": bone_group,
        "Dataset": dataset_name,
        "Accuracy": acc
    })

# Display summary
df_results = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print(df_results.sample(50))


            Model Bone Group      Dataset  Accuracy
81        XGBoost      Tibia        KNN_R  0.979167
44   RandomForest    Humerus        KNN_L  1.000000
62   RandomForest  Calcaneus        KNN_L  1.000000
79       LightGBM      Femur        KNN_R  1.000000
128  RandomForest     Radius  Iterative_R  1.000000
66        XGBoost    Scapula        KNN_R  0.625000
63        XGBoost   Clavicle        KNN_R  0.958333
35       LightGBM  Calcaneus   Original_R  0.875000
125  RandomForest    Humerus  Iterative_R  1.000000
18        XGBoost   Clavicle   Original_R  1.000000
45        XGBoost     Radius        KNN_L  0.979167
65   RandomForest   Clavicle        KNN_R  0.979167
95   RandomForest    Scapula  Iterative_L  1.000000
85       LightGBM     Fibula        KNN_R  0.854167
51        XGBoost      Femur        KNN_L  1.000000
13       LightGBM      Tibia   Original_L  0.571429
38   RandomForest   Clavicle        KNN_L  1.000000
57        XGBoost     Fibula        KNN_L  0.937500
143  RandomF