# 1. Data Preparation

#### A. Physical Activity and Sedentary Data
At least 3 valid days with wear time ≥ 600 minutes are required for each subject to be included in the summary. The summary includes the mean MVPA and mean sedentary time for each subject across their valid days.

In [1]:
import pandas as pd

# Read the raw ActiGraph export (sheet "in")
df = pd.read_excel("raw_databases/actigraph_raw.xlsx", sheet_name="in")

# Step 1: Discard rows whose "Calendar Days" value is 0
has_calendar_days = df["Calendar Days"] != 0
df = df.loc[has_calendar_days]

# Step 2: A day counts as valid when wear time ("Time") is at least 600 mins
meets_wear_time = df["Time"] >= 600
df_valid = df.loc[meets_wear_time]

# Step 3: Subjects qualify once they have 3 or more valid days
days_per_subject = df_valid.groupby("id")["Date"].count()
valid_subjects = days_per_subject.index[days_per_subject >= 3]

# Step 4: Restrict the valid-day rows to the qualifying subjects
is_qualifying = df_valid["id"].isin(valid_subjects)
df_valid_filtered = df_valid.loc[is_qualifying]

# Step 5: Per-subject means of MVPA and sedentary time, plus the day count
summary_spec = {
    "mean_mvpa": ("Total MVPA", "mean"),
    "mean_sedentary": ("Sedentary", "mean"),
    "valid_days": ("Date", "count"),
}
df_summary = df_valid_filtered.groupby("id").agg(**summary_spec).reset_index()

# Persist the summary (consumed later by the merge step) and preview it
df_summary.to_csv("mvpa_sedentary_summary.csv", index=False)
print(df_summary.head())

   id   mean_mvpa  mean_sedentary  valid_days
0   2  484.027778      812.777778           6
1   3  473.333333      824.261905           7
2   4  443.380952      867.761905           7
3   7  397.125000      502.375000           4
4   9  478.571429      785.309524           7


#### B. Sleep Data
Similar to physical activity, at least 3 days of valid data are required per subject. The sleep fragmentation index is averaged across valid days. Sleep fragmentation is computed as in: https://doi.org/10.1002/oby.23754

In [2]:
import pandas as pd

# Minimum number of nights with a usable fragmentation value that a subject
# needs in order to be summarised (mirrors the 3-valid-day rule used for the
# activity data).
MIN_VALID_NIGHTS = 3

# Load the dataset
# NOTE(review): assumes one row per subject-night — confirm against sleep.csv.
df = pd.read_csv("raw_databases/sleep.csv")

# Per subject: mean Sleep Fragmentation Index and the number of non-missing
# nights that contributed to it.
per_subject = df.groupby("Subject Name")["Sleep Fragmentation Index"].agg(
    avg_sleep_frag_index="mean",
    valid_nights="count",
)

# Keep only subjects with enough nights; previously every subject was averaged
# regardless of how many nights they contributed.
enough_nights = per_subject["valid_nights"] >= MIN_VALID_NIGHTS
sleep_summary = (
    per_subject.loc[enough_nights]
    .drop(columns="valid_nights")
    .rename_axis("id")
    .reset_index()
)

# Output keeps the two columns the merge step reads: id, avg_sleep_frag_index
sleep_summary.to_csv("sleep_summary.csv", index=False)

#### C. Meal Microstructure
Microstructure coded from protocol: https://doi.org/10.5281/zenodo.8140895

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Load the coded meal-microstructure data
df = pd.read_csv("raw_databases/micro_beh_summary.csv")

# Ensure numeric columns; rows without a usable id or portion size are dropped
df["ps"] = pd.to_numeric(df["ps"], errors="coerce")
df["id"] = pd.to_numeric(df["id"], errors="coerce")
df = df.dropna(subset=["id", "ps"]).copy()


def _coalesce_numeric(frame, columns):
    """Row-wise first non-missing value across `columns`, after numeric coercion."""
    merged = pd.to_numeric(frame[columns[0]], errors="coerce")
    for name in columns[1:]:
        merged = merged.combine_first(pd.to_numeric(frame[name], errors="coerce"))
    return merged


# Fallback logic: take the first column's value, otherwise fall back in order
df["total_active_eating"] = _coalesce_numeric(
    df, ["total_active_eating_c1", "total_active_eating_c2"])
df["meal_duration"] = _coalesce_numeric(
    df, ["meal_duration_c1", "meal_duration_c2"])
df["bite_rate"] = _coalesce_numeric(
    df, ["bite_rate_c1", "bite_rate_active_c1", "bite_rate_active_c2"])

# Share of the meal spent actively eating (a zero duration yields inf/NaN,
# which the finite-value filter below removes)
df["percent_active_eating"] = df["total_active_eating"] / df["meal_duration"]

OUTPUT_COLUMNS = [
    "id",
    "bite_rate_mean",
    "percent_active_eating_slope",
    "percent_active_eating_intercept",
]

results = []

for child_id, group in df.groupby("id"):
    entry = {
        "id": child_id,
        "bite_rate_mean": np.nan,
        "percent_active_eating_slope": np.nan,
        "percent_active_eating_intercept": np.nan,
    }

    # Mean bite rate: keep if at least one value
    bite_rate_vals = group["bite_rate"].dropna()
    if len(bite_rate_vals) >= 1:
        entry["bite_rate_mean"] = bite_rate_vals.mean()

    # Slope & intercept of % active eating time across portion sizes
    X = group["ps"].values.reshape(-1, 1)
    y = group["percent_active_eating"].values
    valid = np.isfinite(X.ravel()) & np.isfinite(y)

    # Require at least two usable points at two *distinct* portion sizes;
    # otherwise a slope is not defined and the fields stay NaN.
    if valid.sum() >= 2 and np.unique(X[valid]).size >= 2:
        model = LinearRegression().fit(X[valid], y[valid])
        entry["percent_active_eating_slope"] = model.coef_[0]
        entry["percent_active_eating_intercept"] = model.intercept_

    # Previously a child without a fittable slope was dropped entirely via
    # `continue`, discarding a valid bite_rate_mean. Now a child is kept as
    # long as at least one metric could be computed.
    has_any_metric = any(pd.notna(entry[key]) for key in OUTPUT_COLUMNS[1:])
    if has_any_metric:
        results.append(entry)

# Output summary (explicit columns so the file schema is stable even if empty)
summary_df = pd.DataFrame(results, columns=OUTPUT_COLUMNS)
summary_df.to_excel("microstructure_summary.xlsx", index=False)
print("Saved as: microstructure_summary.xlsx")

Saved as: microstructure_summary.xlsx


#### D. Bite Size
Bite size is calculated as intake (grams) at each portion size divided by the number of bites at the respective portion size.

In [4]:
import pandas as pd

# Load the datasets
gram_data = pd.read_csv("raw_databases/intake_data.csv")
bites_data = pd.read_csv("raw_databases/micro_beh_summary.csv")

# Step 1: Melt gram data from wide (one column per portion size) to long format
gram_long = pd.melt(
    gram_data,
    id_vars=["id"],
    value_vars=["ps1_total_g", "ps2_total_g", "ps3_total_g", "ps4_total_g"],
    var_name="ps_label",
    value_name="gram"
)

# Step 2: Extract the portion-size number from the column label
gram_long["ps"] = (
    gram_long["ps_label"].str.extract(r"ps(\d+)_total_g", expand=False).astype(int)
)
gram_long = gram_long.drop(columns="ps_label")

# Step 3: Use preferred nbites column (c1, falling back to c2).
# Coerce each column to numeric *before* coalescing so that a non-numeric
# placeholder in c1 does not block the fallback to c2.
bites_data["nbites"] = pd.to_numeric(bites_data["nbites_c1"], errors="coerce").combine_first(
    pd.to_numeric(bites_data["nbites_c2"], errors="coerce"))

# Step 4: Clean and convert 'ps' column in bites_data
bites_data["ps"] = pd.to_numeric(bites_data["ps"], errors="coerce")
# .copy() so the assignment below targets an independent frame rather than a
# slice of the original (avoids SettingWithCopyWarning).
bites_data = bites_data.dropna(subset=["ps"]).copy()
bites_data["ps"] = bites_data["ps"].astype(int)

# Step 5: Merge gram and bites data on subject and portion size
merged = gram_long.merge(
    bites_data[["id", "ps", "nbites"]],
    on=["id", "ps"],
    how="inner"
)

# Step 6: Clean data and calculate bite size
merged["nbites"] = pd.to_numeric(merged["nbites"], errors="coerce")
merged["gram"] = pd.to_numeric(merged["gram"], errors="coerce")
# Rows need a non-zero bite count (avoids division by zero) and a gram value
usable = merged["nbites"].notna() & (merged["nbites"] != 0) & merged["gram"].notna()
merged = merged.loc[usable].copy()
merged["bite_size_gram"] = merged["gram"] / merged["nbites"]

# Step 7: Keep relevant columns
result = merged[["id", "ps", "gram", "nbites", "bite_size_gram"]]

# Output
print(result.head())
result.to_csv("matched_gram_bites_bitesize.csv", index=False)

   id  ps    gram  nbites  bite_size_gram
0   1   1  615.80     117        5.263248
1   2   1  311.32      51        6.104314
2   3   1  382.28      68        5.621765
3   4   1  689.83      92        7.498152
4   5   1  657.45      75        8.766000


##### Finding slopes and intercept for bite size data
Slopes and intercepts are calculated per subject across portion sizes: portion-size condition (1–4) vs. bite size at each portion size. Bite size is expected to increase with portion size, and the slope and intercept therefore capture that change across portion sizes.

In [5]:
import pandas as pd
from scipy.stats import linregress

# Reload the per-portion bite sizes (grams) written by the previous step,
# coerce the regression inputs to numeric, and discard incomplete rows.
data = (
    pd.read_csv("matched_gram_bites_bitesize.csv")
    .assign(
        ps=lambda frame: pd.to_numeric(frame["ps"], errors="coerce"),
        bite_size_gram=lambda frame: pd.to_numeric(frame["bite_size_gram"], errors="coerce"),
    )
    .dropna(subset=["ps", "bite_size_gram"])
)

# Function to compute bite size slope and intercept per ID
def compute_slope_intercept(group):
    """Fit bite size against portion size for one subject.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single id, with numeric columns "ps" and "bite_size_gram".

    Returns
    -------
    pandas.Series
        "bite_size_slope" and "bite_size_int" from an ordinary least-squares
        fit. Both are NaN when fewer than two distinct portion sizes are
        available, since a slope is undefined in that case.
    """
    if group["ps"].nunique() >= 2:
        slope, intercept, *_ = linregress(group["ps"], group["bite_size_gram"])
        return pd.Series({"bite_size_slope": slope, "bite_size_int": intercept})
    # NaN (not None) keeps the result columns floating point instead of object
    missing = float("nan")
    return pd.Series({"bite_size_slope": missing, "bite_size_int": missing})

# Group by ID and apply the regression. Only the two columns the function
# reads are selected, so apply() does not operate on the grouping column
# (that pattern is deprecated in recent pandas releases).
regression_results = (
    data.groupby("id")[["ps", "bite_size_gram"]]
    .apply(compute_slope_intercept)
    .reset_index()
)

# Output
print(regression_results.head())
regression_results.to_csv("bite_size_regression_per_id.csv", index=False)

   id  bite_size_slope  bite_size_int
0   1        -0.207803       5.471051
1   2        -0.642434       6.605830
2   3        -0.401121       6.215653
3   4        -0.353493       6.920628
4   5        -0.346405       8.848133


#### E. Switching Data Processing
Switching calculated as per: https://doi.org/10.1016/j.physbeh.2023.114312
Averaged across all portion sizes since it is a stable predictor

In [6]:
import pandas as pd

# Read the switching database
data = pd.read_csv("raw_databases/Switching database.csv")

# One switching column per portion size (ps1..ps4)
ps_columns = [f"ps{k}_fswitch_nok" for k in range(1, 5)]

# Coerce all four columns to numeric in one pass; unparseable entries become NaN
data[ps_columns] = data[ps_columns].apply(pd.to_numeric, errors="coerce")

# Row-wise mean across the portion sizes (missing values are skipped)
data["fswitch_nok_mean"] = data[ps_columns].mean(axis=1)

# Retain just the identifier and the averaged measure
result = data[["id", "fswitch_nok_mean"]]

# Preview and save
print(result.head())
result.to_csv("fswitch_nok_mean_per_id.csv", index=False)

   id  fswitch_nok_mean
0   1              9.50
1   2             20.00
2   3             11.25
3   4             16.25
4   5             26.50


####  F. Portion Size Response
Slopes and intercepts are calculated per subject across portion sizes: served portion size vs. intake at each portion size.

In [7]:
import pandas as pd

# Load the Excel file
data = pd.read_excel("raw_databases/PS.xlsx")

# Food items weighed at every portion size, and the four portion-size visits
food_items = ["chkn_nug", "mac_cheese", "grapes", "broccoli", "ketchup"]
ps_levels = range(1, 5)

# Map each total column to the per-food columns it is summed from,
# e.g. "ps1_served_g" <- ["ps1_noplate_chkn_nug_g", ...]
served_cols = {
    f"ps{k}_served_g": [f"ps{k}_noplate_{food}_g" for food in food_items]
    for k in ps_levels
}
consumed_cols = {
    f"ps{k}_consumed_g": [f"ps{k}_consumed_{food}_g" for food in food_items]
    for k in ps_levels
}

# Compute served and consumed totals with numeric conversion.
# min_count=1: when every food weight for a meal is missing the total stays
# NaN instead of becoming 0 g, so missing meals are not mistaken for real
# zero-gram observations in the downstream regression.
for new_col, cols in {**served_cols, **consumed_cols}.items():
    data[new_col] = (
        data[cols].apply(pd.to_numeric, errors="coerce").sum(axis=1, min_count=1)
    )

# Select and reorder final columns
final_cols = ["id"] + list(served_cols.keys()) + list(consumed_cols.keys())
final_data = data[final_cols]

# Save to Excel
final_data.to_excel("portion_weights_summary.xlsx", index=False)

# Show preview
print(final_data.head())

   id  ps1_served_g  ps2_served_g  ps3_served_g  ps4_served_g  ps1_consumed_g  \
0   1        809.31       1047.75       1279.18       1522.45          441.42   
1   2        802.58       1046.71       1271.43       1520.89          245.14   
2   3        799.44       1043.93       1140.36       1481.64          291.89   
3   4        799.87       1043.29       1265.84       1509.93          425.34   
4   5        790.91       1038.25       1284.20       1512.65          591.18   

   ps2_consumed_g  ps3_consumed_g  ps4_consumed_g  
0          412.27          413.48          479.69  
1          384.69          221.97          360.29  
2          476.48          404.43          387.07  
3          626.82          734.18          692.78  
4          663.34          666.57          869.58  


In [8]:
import pandas as pd
from scipy.stats import linregress

# Reload the served/consumed totals produced by the previous cell
data = pd.read_excel("portion_weights_summary.xlsx")

# Paired column lists: position i of each list refers to the same portion size
served_cols = [f"ps{k}_served_g" for k in range(1, 5)]
consumed_cols = [f"ps{k}_consumed_g" for k in range(1, 5)]

# Coerce every weight column to numeric in a single pass
weight_cols = served_cols + consumed_cols
data[weight_cols] = data[weight_cols].apply(pd.to_numeric, errors="coerce")

# Function to compute slope and intercept per child
def compute_ps_response(row, x_cols=None, y_cols=None):
    """Regress consumed grams on served grams for one child.

    Parameters
    ----------
    row : pandas.Series
        One child's record containing the served and consumed weight columns.
    x_cols, y_cols : list of str, optional
        Names of the served (x) and consumed (y) columns, paired by position.
        Default to the module-level ``served_cols`` / ``consumed_cols``.

    Returns
    -------
    pandas.Series
        "ps_response_slope" and "ps_response_int". Both are NaN when fewer
        than two complete pairs exist or when all served weights are equal.
    """
    if x_cols is None:
        x_cols = served_cols
    if y_cols is None:
        y_cols = consumed_cols

    # Explicit float arrays so the fit does not depend on the row's dtype
    x = pd.to_numeric(row[x_cols], errors="coerce").to_numpy(dtype=float)
    y = pd.to_numeric(row[y_cols], errors="coerce").to_numpy(dtype=float)

    # Filter out missing pairs
    valid = ~(pd.isna(x) | pd.isna(y))
    x_valid = x[valid]
    y_valid = y[valid]

    # linregress raises ValueError when every x is identical, so require at
    # least two distinct served weights before fitting.
    if len(x_valid) >= 2 and len(set(x_valid.tolist())) >= 2:
        slope, intercept, *_ = linregress(x_valid, y_valid)
        return pd.Series({"ps_response_slope": slope, "ps_response_int": intercept})

    missing = float("nan")
    return pd.Series({"ps_response_slope": missing, "ps_response_int": missing})

# Fit one regression per row (child) and attach the two outputs
response_results = data[["id"] + served_cols + consumed_cols].copy()
fitted = response_results.apply(compute_ps_response, axis=1)
response_results["ps_response_slope"] = fitted["ps_response_slope"]
response_results["ps_response_int"] = fitted["ps_response_int"]

# Reduce to the identifier plus regression outputs; rows lacking a fit are dropped
output_columns = ["id", "ps_response_slope", "ps_response_int"]
final_results = response_results.loc[:, output_columns].dropna()

# Write the slopes/intercepts to Excel
final_results.to_excel("ps_response_slopes.xlsx", index=False)

# Preview
print(final_results.head())

   id  ps_response_slope  ps_response_int
0   1           0.049458       379.112909
1   2           0.081456       208.500184
2   3           0.100569       277.698276
3   4           0.385958       174.102320
4   5           0.344736       298.979595


#### Merging Database
Combining all predictors and data across all the above

In [9]:
import pandas as pd
from functools import reduce

# ---------- Load individual datasets ----------

# Helper function to load and cast 'id' to str
def load_and_cast(filepath, usecols, filetype="csv"):
    """Load selected columns from a CSV or Excel file with 'id' as a string key.

    Parameters
    ----------
    filepath : str
        Path of the file to read.
    usecols : list of str
        Columns to load; must include "id".
    filetype : str, default "csv"
        "csv" reads with ``pd.read_csv``; any other value reads with
        ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with "id" converted to text. Whole-number ids stored
        as floats (e.g. 2.0, which happens when the column contains blanks)
        are rendered as "2" rather than "2.0", so the same subject gets the
        same key in every file and the id-based merges line up.
    """
    if filetype == "csv":
        df = pd.read_csv(filepath, usecols=usecols)
    else:
        df = pd.read_excel(filepath, usecols=usecols)

    id_text = df["id"].astype(str)
    if pd.api.types.is_numeric_dtype(df["id"]):
        id_num = pd.to_numeric(df["id"], errors="coerce")
        # Whole-valued, non-missing ids: format via int to drop the ".0" suffix
        whole = id_num.notna() & (id_num % 1 == 0)
        id_text.loc[whole] = id_num.loc[whole].astype("int64").astype(str)
    df["id"] = id_text
    return df

# ---------- Derived summaries written by the earlier cells ----------

# 1. MVPA and sedentary data
mvpa = load_and_cast(
    "mvpa_sedentary_summary.csv",
    ["id", "mean_mvpa", "mean_sedentary"],
)

# 2. Sleep fragmentation
sleep = load_and_cast("sleep_summary.csv", ["id", "avg_sleep_frag_index"])

# 3. Microstructure summary
microstructure = load_and_cast(
    "microstructure_summary.xlsx",
    ["id", "percent_active_eating_slope", "percent_active_eating_intercept"],
    filetype="excel",
)

# 4. Bite size regression
bite_size = load_and_cast(
    "bite_size_regression_per_id.csv",
    ["id", "bite_size_slope", "bite_size_int"],
)

# 5. FSwitch mean
fswitch = load_and_cast("fswitch_nok_mean_per_id.csv", ["id", "fswitch_nok_mean"])

# 6. PS response slopes
ps_response = load_and_cast(
    "ps_response_slopes.xlsx",
    ["id", "ps_response_slope", "ps_response_int"],
    filetype="excel",
)

# ---------- Raw databases ----------

# 7. Anthropometry
anthro = load_and_cast(
    "raw_databases/anthro_data.csv",
    [
        "id", "risk_status_mom", "sex", "age_yr", "parent_ed", "income",
        "bmi_percentile", "v7_bmi_percentile",
    ],
)

# 8. Demographics
demo = load_and_cast(
    "raw_databases/demographics_data.csv",
    ["id", "pds_score", "pds_tanner_cat"],
)

# 9. Intake
intake = load_and_cast(
    "raw_databases/intake_data.csv",
    ["id", "v1_meal_total_kcal", "v1_eah_total_kcal"],
)

# 10. Behavioral questionnaires: whole file is read, then subscales are selected
cebq_cols = [
    "cebq_fr", "cebq_eoe", "cebq_ef", "cebq_dd", "cebq_sr",
    "cebq_se", "cebq_eue", "cebq_ff", "cebq_approach", "cebq_avoid",
]
cfq_cols = [
    "cfq_resp", "cfq_pcw", "cfq_ppw", "cfq_cwc",
    "cfq_rest", "cfq_pressure", "cfq_mon",
]
ffbs_cols = ["ffbs_control", "ffbs_presence", "ffbs_ch_choice", "ffbs_org"]
pwlb_cols = ["pwlb_healthy", "pwlb_unhealthy"]
tfeq_cols = ["tfeq_cogcontrol", "tfeq_disinhibition", "tfeq_hunger"]
qs_cols = ["id"] + cebq_cols + cfq_cols + ffbs_cols + pwlb_cols + tfeq_cols

qs = pd.read_csv("raw_databases/qs_eatbeh_bodyimage.csv")
qs["id"] = qs["id"].astype(str)
qs_selected = qs[qs_cols]

# 11. Additional cognitive/psychosocial variables
cog_psych = load_and_cast(
    "raw_databases/qs_cog_psych_soc.csv",
    ["id", "bas_funseeking", "bas_drive", "bas_rewardresp", "bis", "brief2_gec_p"],
)

# ---------- Merge all datasets ----------
# Anthropometry is the base table; every other table is left-joined onto it
# by id, in this order.
dfs = [
    anthro, mvpa, sleep, microstructure, bite_size, fswitch, ps_response,
    demo, intake, qs_selected, cog_psych
]

merged_df = dfs[0]
for right in dfs[1:]:
    merged_df = pd.merge(merged_df, right, on="id", how="left")

# ---------- Ensure v7_bmi_percentile is numeric ----------
merged_df["v7_bmi_percentile"] = pd.to_numeric(merged_df["v7_bmi_percentile"], errors="coerce")

# ---------- Keep only rows that have a v7_bmi_percentile value ----------
final_df = merged_df.dropna(subset=["v7_bmi_percentile"]).reset_index(drop=True)

# ---------- Save merged file ----------
final_df.to_csv("merged_dataset.csv", index=False)
print("✅ Merged dataset saved as 'merged_dataset.csv' with shape:", final_df.shape)

✅ Merged dataset saved as 'merged_dataset.csv' with shape: (76, 53)


# 2. Synthesizing Dataset
Synthesizing a pseudo-real-world dataset from the current dataset to increase the sample size for the tutorial.

In [10]:
import pandas as pd
import numpy as np
import os
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sklearn.impute import SimpleImputer
from sklearn.utils import resample

# --- Tunables ---
SEED = 42              # fixes NumPy draws and the missingness resampling
TARGET_ROWS = 350      # total rows requested (real + synthetic, before validation)
NOISE_FRACTION = 0.01  # jitter SD as a fraction of each real column's SD

# Seed NumPy's global generator so the noise / pattern draws below repeat.
# NOTE(review): CTGAN training may use its own random state that this seed
# does not control — confirm before relying on bit-identical output.
np.random.seed(SEED)

# --- Load dataset and clean blanks ---
df = pd.read_csv("merged_dataset.csv")
df = df.replace(r"^\s*$", np.nan, regex=True)

# Separate ID and data
df_id = df["id"]
df_data = df.drop(columns=["id"])

# --- Drop fully missing columns (restored as all-NaN later) ---
all_nan_cols = df_data.columns[df_data.isnull().all()]
df_data = df_data.drop(columns=all_nan_cols)

# --- Store missingness pattern of the real data ---
missing_mask = df_data.isnull()

# --- Impute missing values (the synthesizer is fitted on complete data) ---
imputer = SimpleImputer(strategy="most_frequent")
df_imputed = pd.DataFrame(imputer.fit_transform(df_data), columns=df_data.columns)

# --- Categorical columns (stored as integer codes) ---
categorical_columns = [
    "sex", "risk_status_mom", "income", "parent_ed", "pds_tanner_cat"
]

# --- Coerce types ---
for col in categorical_columns:
    if col in df_imputed.columns:
        df_imputed[col] = df_imputed[col].astype(int)

for col in df_imputed.columns:
    if col not in categorical_columns:
        df_imputed[col] = pd.to_numeric(df_imputed[col], errors='coerce')

# --- Metadata ---
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_imputed)

manual_dtypes = {col: "categorical" for col in categorical_columns}
for col in df_imputed.columns:
    if col not in categorical_columns:
        manual_dtypes[col] = "numerical"

for col, dtype in manual_dtypes.items():
    if col in metadata.columns:
        metadata.update_column(column_name=col, sdtype=dtype)

# Remove any stale copy before saving the metadata
if os.path.exists("ctgan_metadata.json"):
    os.remove("ctgan_metadata.json")
metadata.save_to_json("ctgan_metadata.json")

# --- Fit model ---
synthesizer = CTGANSynthesizer(metadata, epochs=300)
synthesizer.fit(df_imputed)

# --- Generate data ---
n_original = df_data.shape[0]
n_to_generate = TARGET_ROWS - n_original
synthetic_new = synthesizer.sample(num_rows=n_to_generate)

# --- Add small noise to continuous columns ---
# Done BEFORE clipping/rounding so the bounds and the 0.25 grid enforced below
# still hold in the final data, and skipped for categorical codes so that
# e.g. sex stays 0/1 instead of becoming 0.99 or 1.01.
numeric_cols = df_data.select_dtypes(include=['number']).columns
continuous_cols = [
    col for col in numeric_cols
    if col in synthetic_new.columns and col not in categorical_columns
]
for col in continuous_cols:
    std = df_data[col].std()
    if pd.notna(std) and std > 0:
        synthetic_new[col] = synthetic_new[col] + np.random.normal(
            0, NOISE_FRACTION * std, size=len(synthetic_new))

# --- Clip known bounds ---
clip_bounds = {
    "v1_meal_total_kcal": (200, 3000),
    "v1_eah_total_kcal": (0, 2000),
    "age_yr": (2, 12),
    "bmi_percentile": (15, 95),
    "v7_bmi_percentile": (15, 95),
    "sex": (0, 1),
    "risk_status_mom": (0, 1),
    "income": (0, 5),
    "parent_ed": (0, 5),
    "pds_tanner_cat": (1, 2)
}
for col, (low, high) in clip_bounds.items():
    if col in synthetic_new.columns:
        synthetic_new[col] = pd.to_numeric(synthetic_new[col], errors='coerce')
        synthetic_new[col] = synthetic_new[col].clip(lower=low, upper=high)
        if col in categorical_columns:
            synthetic_new[col] = synthetic_new[col].round().astype(int)

# --- Enforce BMI-MVPA-Sedentary pattern ---
# Overwrites the sampled values for these four columns with a constructed
# relationship (higher BMI -> less MVPA, more sedentary time).
# NOTE(review): these formulas give MVPA around 40 and sedentary around 350,
# whereas the real per-subject means previewed earlier are roughly 400-480
# and 500-870 — confirm the intended scale.
if set(["bmi_percentile", "v7_bmi_percentile", "mean_mvpa", "mean_sedentary"]).issubset(synthetic_new.columns):
    bmi_base = np.random.normal(50, 10, size=n_to_generate)
    mvpa_noise = np.random.normal(0, 5, size=n_to_generate)
    sed_noise = np.random.normal(0, 10, size=n_to_generate)

    synthetic_new["mean_mvpa"] = np.maximum(0, 90 - bmi_base + mvpa_noise)
    synthetic_new["mean_sedentary"] = np.maximum(100, 300 + bmi_base + sed_noise)
    synthetic_new["bmi_percentile"] = np.clip(bmi_base + np.random.normal(0, 3, size=n_to_generate), 15, 95)
    synthetic_new["v7_bmi_percentile"] = np.clip(synthetic_new["bmi_percentile"] + np.random.normal(0, 2, size=n_to_generate), 15, 95)

# --- Round fswitch and switch features to 0.25 increments ---
for col in synthetic_new.columns:
    if "switch" in col:
        synthetic_new[col] = (synthetic_new[col] / 0.25).round() * 0.25

# --- Ensure bite/percent active eating slopes are non-negative ---
# Matches slope columns only; the earlier "intercept" name check did not
# exclude "bite_size_int", which was therefore clipped as well.
# NOTE(review): the real bite_size_slope values previewed earlier are
# negative — confirm that flooring synthetic slopes at 0 is intended.
for col in synthetic_new.columns:
    if ("bite" in col or "percent_active_eating" in col) and col.endswith("_slope"):
        synthetic_new[col] = synthetic_new[col].clip(lower=0)

# --- Round continuous columns to 2 decimals ---
for col in continuous_cols:
    synthetic_new[col] = synthetic_new[col].round(2)

# --- Reinject missingness ---
# Resample whole-row missingness patterns from the real data, then align the
# mask to the synthetic frame by column NAME (not position) before applying.
sampled_masks = resample(
    missing_mask.values, n_samples=n_to_generate, replace=True, random_state=SEED)
mask_frame = pd.DataFrame(
    sampled_masks, columns=missing_mask.columns, index=synthetic_new.index
).reindex(columns=synthetic_new.columns, fill_value=False)
synthetic_with_missing = synthetic_new.mask(mask_frame)

# --- Row validation ---
def is_valid_row(row):
    """Return True when a synthetic row passes the plausibility checks.

    Checks that "age_yr" lies in [2, 12] and "v1_meal_total_kcal" is at most
    3500. A missing (NaN) value is not treated as a violation, so rows whose
    values were deliberately blanked by the missingness re-injection are kept.
    If a key is absent altogether the original defaults apply (0 for both),
    which fails the age check.

    Parameters
    ----------
    row : mapping with ``.get`` (e.g. pandas.Series or dict)

    Returns
    -------
    bool
    """
    age = row.get("age_yr", 0)
    kcal = row.get("v1_meal_total_kcal", 0)
    age_ok = pd.isna(age) or 2 <= age <= 12
    kcal_ok = pd.isna(kcal) or kcal <= 3500
    return bool(age_ok and kcal_ok)

# Keep only the synthetic rows that pass validation
keep_rows = synthetic_with_missing.apply(is_valid_row, axis=1)
synthetic_valid = synthetic_with_missing.loc[keep_rows].reset_index(drop=True)

# --- Assign new IDs, continuing after the largest real id ---
max_real_id = df_id.max()
first_new_id = max_real_id + 1
new_ids = range(first_new_id, first_new_id + len(synthetic_valid))
synthetic_valid.insert(0, "id", new_ids)

# --- Restore any previously dropped (all-missing) columns as NaN in both frames ---
for col in all_nan_cols:
    for frame in (df_data, synthetic_valid):
        frame[col] = np.nan

# --- Combine original and synthetic data, ordered by id ---
df_real_with_id = df_data.copy()
df_real_with_id.insert(0, "id", df_id.values)

df_combined = (
    pd.concat([df_real_with_id, synthetic_valid], ignore_index=True)
    .sort_values("id")
    .reset_index(drop=True)
)

# --- Save outputs ---
df_combined.to_csv("ML_child_obesity_syn_data.csv", index=False)
print("✅ Final dataset saved as 'ML_child_obesity_syn_data.csv'")
print("✅ Metadata saved as 'ctgan_metadata.json'")


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.



✅ Final dataset saved as 'ML_child_obesity_syn_data.csv'
✅ Metadata saved as 'ctgan_metadata.json'
