## Environment

In [None]:
%pip install numpy
%pip install matplotlib
%pip install scikit-learn
%pip install pandas
%pip install imbalanced-learn
%pip install xgboost
%pip install onnxmltools
%pip install skl2onnx

In [2]:
# Libraries
import json

import matplotlib.pyplot as plt
import numpy as np
import onnxmltools
import pandas as pd
import xgboost as xgb
from skl2onnx.common.data_types import FloatTensorType
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import plot_importance, XGBClassifier


# Generate synthetic data


In [None]:
# Create function to generate data for clinical analysis
def generate_obesity_data_v2(n_samples=1000, noise_rate=0.0, random_state=None):
    """Generate a synthetic clinical dataset for obesity classification.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of rows (synthetic patients) to generate.
    noise_rate : float, default 0.0
        Probability, in [0, 1], that the label of a row is flipped. The default
        of 0.0 produces labels that follow the rule below exactly.
    random_state : int or None, default None
        Seed for a dedicated ``np.random.RandomState``. When ``None`` the global
        ``np.random`` state is used, so results depend on any prior
        ``np.random.seed`` call.

    Returns
    -------
    pandas.DataFrame
        Columns ``bmi, ldl, triglycerides, glucose, hb1ac, pcr, insuline, ast,
        alt, leptin, label``. ``label`` is 1 when BMI > 30, LDL > 160 and
        triglycerides > 200 all hold (before optional label noise), else 0.
        The columns glucose, hb1ac, pcr, insuline and leptin may contain NaN.
    """
    # Same drawing API either way; the module itself keeps the legacy global-state behaviour.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def normal_with_missing(mean, std, low, high):
        """Draw clipped normal values, then resample them (with replacement) from a pool holding one NaN.

        Each returned entry is NaN with probability 1 / (n_samples + 1).
        """
        pool = rng.normal(mean, std, n_samples).clip(low, high)
        return rng.choice([np.nan, *pool], n_samples)

    # Core features; LDL and triglycerides are shifted upwards with BMI.
    bmi = rng.normal(32, 5, n_samples).clip(18.5, 50)  # Body Mass Index (kg/m^2) (normal range: 18.5 - 24.9)
    ldl = rng.normal(140 + (bmi - 25) * 2, 30, n_samples).clip(50, 250)  # LDL (mg/dL) (normal range: < 100)
    triglycerides = rng.normal(180 + (bmi - 25) * 3, 50, n_samples).clip(50, 500)  # Triglycerides (mg/dL) (normal range: < 150)
    glucose = normal_with_missing(110, 15, 70, 200)  # Glucose (mg/dL) (normal range: 70 - 100)
    hb1ac = normal_with_missing(5.5, 1, 4, 10)  # Hemoglobin A1c (%) (normal range: < 5.7)

    # Additional (optional) parameters
    pcr = normal_with_missing(1.5, 0.5, 0.5, 10)  # C-reactive protein (mg/dL) (normal range: < 1)
    insuline = normal_with_missing(15, 5, 2, 50)  # Insulin (uU/mL) (normal range: 2 - 25)
    ast = rng.normal(20, 10, n_samples).clip(0, 100)  # Hepatic enzymes (AST) (normal range: < 40)
    alt = rng.normal(25, 10, n_samples).clip(0, 100)  # Hepatic enzymes (ALT) (normal range: < 40)
    leptin = normal_with_missing(25, 5, 10, 50)  # Leptin (ng/mL) (normal range: 10 - 25)

    # Bernoulli mask: 1 marks a row whose label will be flipped.
    noise = rng.binomial(1, noise_rate, n_samples)

    # Label based on obesity criteria (BMI > 30, LDL > 160, Triglycerides > 200)
    label = ((bmi > 30) & (ldl > 160) & (triglycerides > 200)).astype(int)  # 1: Obese, 0: Not obese
    label = (label + noise) % 2  # Flip the labels selected by the mask

    return pd.DataFrame({
        "bmi": bmi,
        "ldl": ldl,
        "triglycerides": triglycerides,
        "glucose": glucose,
        "hb1ac": hb1ac,
        "pcr": pcr,
        "insuline": insuline,
        "ast": ast,
        "alt": alt,
        "leptin": leptin,
        "label": label,
    })

# Seed NumPy's global generator so the synthetic dataset is the same on every
# Restart & Run All.
np.random.seed(42)
df_obesity = generate_obesity_data_v2(n_samples=1000)  # Generate the dataset with 1000 samples
df_obesity.head()  # Show the first 5 samples of the dataset (DataFrame)

# Imputation for missing data

In [4]:

# Separate the target column from the predictors.
y = df_obesity["label"]  # Label (y)
X = df_obesity.drop(columns=["label"])  # Features (X)

# Replace every missing value with the mean of its column.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split done later, so held-out rows contribute to the column means.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# Create the fine-tuning file for OpenAI

In [None]:
# Build the JSONL file used for fine-tuning
def prepare_jsonl(df, output_file):
    """Write a DataFrame as a JSONL file of prompt/completion records for fine-tuning.

    Each row becomes one line holding a JSON object with two keys:
    ``"prompt"`` (the clinical values followed by the question) and
    ``"completion"`` (``" Sí"`` when ``label == 1``, otherwise ``" No"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``bmi, ldl, triglycerides, glucose, hb1ac,
        pcr, insuline, ast, alt, leptin, label``.
    output_file : str or path-like
        Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8: the prompts contain accented characters.
    with open(output_file, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            # Create the prompt
            prompt = (
                f"Valores clínicos:\n"
                f"BMI: {row['bmi']}, LDL: {row['ldl']}, Triglicéridos: {row['triglycerides']}, "
                f"Glucosa: {row['glucose']}, HbA1c: {row['hb1ac']}, "
                f"PCR: {row['pcr']}, Insulina: {row['insuline']}, AST: {row['ast']}, ALT: {row['alt']}, Leptina: {row['leptin']}.\n"
                f"¿Tiene obesidad?"
            )
            # Create the completion (answer)
            completion = " Sí" if row["label"] == 1 else " No"
            record = {"prompt": prompt, "completion": completion}
            # json.dumps produces valid JSON (double quotes, escaped newlines);
            # formatting the dict directly would write its Python repr instead.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Generate the JSONL file for fine-tuning (a single pass; the file is overwritten on re-run)
output_file = "obesity_finetuning_data.jsonl"
prepare_jsonl(df_obesity, output_file)
print(f"Archivo JSONL generado: {output_file}")

# Model training

In [None]:


# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Wrap both splits in DataFrames with the generic column names f0, f1, ... so the
# model is fitted and evaluated on inputs with the same schema.
# X_train / X_test themselves stay as arrays.
generic_feature_names = [f"f{i}" for i in range(X_train.shape[1])]
X_train_renamed = pd.DataFrame(X_train, columns=generic_feature_names)
X_test_renamed = pd.DataFrame(X_test, columns=generic_feature_names)

# scale_pos_weight = negatives / positives in the training split (class-imbalance weight)
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("The training split contains no positive samples; cannot compute scale_pos_weight.")
scale_pos_weight = n_negative / n_positive

# Create the XGBoost model
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42, eval_metric="logloss")
# Train the model
xgb_model.fit(X_train_renamed, y_train)

# Evaluate the model on the held-out test split
y_pred_xgb = xgb_model.predict(X_test_renamed)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))  # Accuracy of the model (before optimization)
print(classification_report(y_test, y_pred_xgb))  # Classification report of the model (before optimization)

# Data visualization

In [None]:
# Show the clinical column names on the plot instead of the generic training names.
# This overwrites the feature names stored on the trained booster.
xgb_model.get_booster().feature_names = df_obesity.drop(columns="label").columns.tolist()

# Create the axes up front and pass them to plot_importance; without ax= it draws on
# a new figure of its own, leaving the 10x8 figure empty.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(xgb_model, ax=ax, importance_type="weight")  # "weight" importance
ax.set_title("Importancia de las características - XGBoost")  # Title of the plot
plt.show()  # Show the plot

# Model optimization

In [8]:


# Parameters for the native XGBoost training API
params = dict(
    objective="binary:logistic",
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    max_depth=5,
    eta=0.1,
    subsample=0.8,
)

# Wrap the train and test splits in DMatrix containers
# (no ONNX conversion happens in this cell)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Fit a booster for 100 boosting rounds
bst = xgb.train(params, dtrain, num_boost_round=100)

# Score the test split, then threshold the scores at 0.5 to obtain 0/1 labels
y_pred = bst.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)


# Evaluation of accuracy

In [None]:
# Evaluation of the natively trained booster on the test dataset (after optimization)
print("Accuracy:", accuracy_score(y_test, y_pred_binary))
# Classification report of the model (after optimization)
print(classification_report(y_test, y_pred_binary))

# Parameters for cross-validation.
# NOTE(review): this rebinds `params` and, unlike the dict used for training,
# has no "scale_pos_weight" -- confirm that cross-validating without the class
# weight is intentional.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 5,
    "subsample": 0.8
}

# Cross-validation (XGBoost) - 5 folds
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    nfold=5,
    metrics=["logloss"],
    as_pandas=True,
    seed=42
)

# Show only the last rows of the cross-validation table instead of dumping all of it
print(cv_results.tail())

# "test-logloss-mean" is the column being summarised; report its minimum over
# the rows and the row index where it occurs.
best_row = cv_results["test-logloss-mean"].idxmin()
print("Best mean test logloss:", cv_results["test-logloss-mean"].min())
print("Reached at row index:", best_row)

# Save model

In [None]:



# Put the training-time column names of X_train_renamed back on the booster
# before exporting.
xgb_model.get_booster().feature_names = list(X_train_renamed.columns)

# Declare the single model input: a float tensor with one column per feature
# (first dimension left as None).
initial_type = [("float_input", FloatTensorType([None, len(X_train_renamed.columns)]))]

# Convert the fitted XGBoost classifier to an ONNX model
onnx_model = onnxmltools.convert_xgboost(xgb_model, initial_types=initial_type)

# Serialize the ONNX model to disk
with open("xgboost_obesity_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

# Confirmation message
print("Modelo ONNX generado correctamente.")