In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [None]:
# 1. Load dataset
# Relative path: it is resolved against the current working directory of the kernel.
DATA_PATH = "../src/data/parkinsons_updrs.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the raw table
df.head()

In [None]:
# Descriptive statistics per column (pandas default selection of columns)
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# -------------------------
# Columns
# -------------------------
# Names of the dataframe columns that the rest of the notebook refers to by constant.
TARGET = "total_UPDRS"  # column used as the regression target
AGE_COL = "age"
# NOTE(review): the coding stated on the next line is this notebook's assumption.
# The UCI Parkinson's Telemonitoring attribute description lists sex as
# 0 = male, 1 = female — confirm against the CSV, because the male/female
# subsets further down are built from this coding.
GENDER_COL = "sex"   # 0 = female, 1 = male



In [None]:
# Columns kept out of the model inputs: subject id, demographics and both UPDRS scores.
# NOTE(review): any other non-speech column present in the CSV (for example a
# recording-time field) would still be part of X — check the output of X.info().
NON_FEATURES = ["subject#", "age", "sex", "motor_UPDRS", "total_UPDRS"]
y = df[TARGET]
X = df.drop(NON_FEATURES, axis=1)



In [None]:
# Preview the first rows of the feature matrix
X.head()

In [None]:
# Column names, dtypes and non-null counts of the feature matrix
X.info()

In [None]:
# Preview the first target values
y.head()

In [None]:
# =========================
# 2. Feature scaling + PCA
# =========================
# Standardise the features first so that PCA works on comparable scales.
# NOTE(review): the scaler and the PCA are both fitted on every row, i.e. before
# the train/test splits made later in the notebook — held-out rows therefore
# influence the scaling statistics and the components.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Project the standardised features onto 13 principal components.
pca = PCA(random_state=42, n_components=13)
X_pca = pca.fit_transform(X_scaled)



In [None]:
# Tabulate the PCA spectrum: one row per component with its eigenvalue,
# its share of the variance and the running total of those shares.
eigenvalues = pca.explained_variance_
variance_ratio = pca.explained_variance_ratio_
cumsum_variance = variance_ratio.cumsum()

pc_labels = [f'PC{i+1}' for i, _ in enumerate(eigenvalues)]
eigenvalue_df = pd.DataFrame({
    'PC': pc_labels,
    'Eigenvalue': eigenvalues,
    'Explained Variance Ratio': variance_ratio,
    'Cumulative Variance Ratio': cumsum_variance,
})

print(eigenvalue_df)
# The last cumulative value is the variance retained by all kept components together.
print(f"\nTotal Explained Variance: {cumsum_variance[-1]:.4f}")

In [None]:
# Rebuild dataframe with metadata
# The PC column labels are derived from the width of X_pca, so they follow the
# number of components chosen for the PCA instead of repeating a literal count.
pca_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(X_pca.shape[1])])
# `.values` strips the source index, so each column is assigned by position.
pca_df[AGE_COL] = df[AGE_COL].values
pca_df[GENDER_COL] = df[GENDER_COL].values
pca_df[TARGET] = y.values





In [None]:
# =========================
# 3. Age & Gender Partition
# =========================
def age_group(age):
    """Return the age-bucket label for ``age``.

    ``"lt60"`` for ages below 60, ``"60to70"`` for ages from 60 to 70
    inclusive, and ``"gt70"`` for every other value.
    """
    if age < 60:
        return "lt60"
    # Reaching this point means the age is not below 60, so only the upper bound is left to test.
    if age <= 70:
        return "60to70"
    return "gt70"

# Label every row with the bucket its age falls into
pca_df["age_group"] = pca_df[AGE_COL].map(age_group)



In [None]:
# =========================
# Create group-wise datasets
# =========================
# One boolean mask per sex code and one per age bucket; every subset below is
# the intersection of one sex mask with one age mask.
# NOTE(review): the male/female naming relies on this notebook's assumption that
# sex == 1 means male — verify against the dataset documentation.
is_male = pca_df[GENDER_COL] == 1
is_female = pca_df[GENDER_COL] == 0

is_lt60 = pca_df["age_group"] == "lt60"
is_60_70 = pca_df["age_group"] == "60to70"
is_gt70 = pca_df["age_group"] == "gt70"

male_lt60 = pca_df[is_male & is_lt60]
male_60_70 = pca_df[is_male & is_60_70]
male_gt70 = pca_df[is_male & is_gt70]

female_lt60 = pca_df[is_female & is_lt60]
female_60_70 = pca_df[is_female & is_60_70]
female_gt70 = pca_df[is_female & is_gt70]


In [None]:
# Row count, dtypes and non-null counts of the male_lt60 subset
male_lt60.info()

In [None]:
# Row count, dtypes and non-null counts of the male_60_70 subset
male_60_70.info()

In [None]:
# Row count, dtypes and non-null counts of the male_gt70 subset
male_gt70.info()

In [None]:
# Row count, dtypes and non-null counts of the female_lt60 subset
female_lt60.info()

In [None]:
# Row count, dtypes and non-null counts of the female_60_70 subset
female_60_70.info()

In [None]:
# Row count, dtypes and non-null counts of the female_gt70 subset
female_gt70.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from xgboost import XGBRegressor
import numpy as np


def train_xgb_group(data, group_name, feature_cols=None, test_size=0.15, random_state=42):
    """Train and evaluate an XGBoost regressor on one subset of the PCA dataframe.

    The rows of ``data`` are split at random into a training part and a
    held-out part; the model is fitted on the former and scored on the latter.
    The metrics are printed under ``group_name`` and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the feature columns and the ``TARGET`` column.
    group_name : str
        Heading printed above the metrics.
    feature_cols : sequence of str, optional
        Columns used as model inputs. When omitted, every column named
        ``PC<number>`` is used, in dataframe order, so the function follows
        however many principal components the dataframe carries.
    test_size : float, default 0.15
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for both the split and the model.

    Returns
    -------
    tuple
        ``(model, mae, rmse, r2)``: the fitted ``XGBRegressor`` followed by the
        mean absolute error, root mean squared error and R² on the held-out rows.

    Raises
    ------
    KeyError
        If no feature column can be determined, or a requested column is missing.

    NOTE(review): the split is row-wise. If ``data`` holds several rows per
    subject (the notebook drops a ``subject#`` column before the PCA), rows of
    one subject can end up on both sides of the split — confirm whether a
    subject-wise split is required.
    """
    if feature_cols is None:
        # Infer the principal-component columns instead of assuming a fixed count.
        feature_cols = [
            col for col in data.columns
            if isinstance(col, str) and col.startswith("PC") and col[2:].isdigit()
        ]
    else:
        feature_cols = list(feature_cols)
    if not feature_cols:
        raise KeyError("no principal-component feature columns found in `data`")

    features = data[feature_cols]
    target = data[TARGET]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = XGBRegressor(
        n_estimators=710,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=random_state,
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ===== Metrics =====
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"\n{group_name}")
    print(f"MAE  : {mae:.3f}")
    print(f"RMSE : {rmse:.3f}")
    print(f"R²   : {r2:.3f}")

    return model, mae, rmse, r2


**Model for Male < 60**

In [None]:
# Train on the male_lt60 subset; keep the fitted model and its three test metrics
(
    model_male_lt60,
    mae_male_lt60,
    rmse_male_lt60,
    r2_male_lt60,
) = train_xgb_group(male_lt60, "Male < 60")


**Model for Male between 60 and 70**

In [None]:
# Train on the male_60_70 subset; keep the fitted model and its three test metrics
(
    model_male_60_70,
    mae_male_60_70,
    rmse_male_60_70,
    r2_male_60_70,
) = train_xgb_group(male_60_70, "Male 60–70")


**Model for Male > 70**

In [None]:
# Train on the male_gt70 subset; keep the fitted model and its three test metrics
(
    model_male_gt70,
    mae_male_gt70,
    rmse_male_gt70,
    r2_male_gt70,
) = train_xgb_group(male_gt70, "Male > 70")


**Model for Female < 60**

In [None]:
# Train on the female_lt60 subset; keep the fitted model and its three test metrics
(
    model_female_lt60,
    mae_female_lt60,
    rmse_female_lt60,
    r2_female_lt60,
) = train_xgb_group(female_lt60, "Female < 60")


**Model for Female between 60 and 70**

In [None]:
# Train on the female_60_70 subset; keep the fitted model and its three test metrics
(
    model_female_60_70,
    mae_female_60_70,
    rmse_female_60_70,
    r2_female_60_70,
) = train_xgb_group(female_60_70, "Female 60–70")


**Model for Female > 70**

In [None]:
# Train on the female_gt70 subset; keep the fitted model and its three test metrics
(
    model_female_gt70,
    mae_female_gt70,
    rmse_female_gt70,
    r2_female_gt70,
) = train_xgb_group(female_gt70, "Female > 70")


In [None]:
# Gather the held-out metrics of the six group models, one row per group
group_rows = [
    ("Male < 60", mae_male_lt60, rmse_male_lt60, r2_male_lt60),
    ("Male 60–70", mae_male_60_70, rmse_male_60_70, r2_male_60_70),
    ("Male > 70", mae_male_gt70, rmse_male_gt70, r2_male_gt70),
    ("Female < 60", mae_female_lt60, rmse_female_lt60, r2_female_lt60),
    ("Female 60–70", mae_female_60_70, rmse_female_60_70, r2_female_60_70),
    ("Female > 70", mae_female_gt70, rmse_female_gt70, r2_female_gt70),
]
results_df = pd.DataFrame(group_rows, columns=["Group", "MAE", "RMSE", "R2"])

print(results_df)


**Model without splitting the data by age and gender**

In [None]:
# Model inputs: the 13 principal-component columns of the full, unpartitioned table
pc_columns = [f"PC{i+1}" for i in range(13)]
X_all = pca_df.loc[:, pc_columns]

# Regression target for the same rows
y_all = pca_df.loc[:, TARGET]


In [None]:
# Hold out 15% of all rows for evaluation; the fixed seed makes the split repeatable
split_parts = train_test_split(X_all, y_all, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Hyperparameters of the single model trained on all groups together
global_xgb_params = {
    "n_estimators": 710,
    "learning_rate": 0.01,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
}
model_global = XGBRegressor(**global_xgb_params)

# Left as the last expression so the notebook still displays the fitted estimator
model_global.fit(X_train, y_train)


In [None]:
# Score the global model on the held-out rows
y_pred = model_global.predict(X_test)

mae_global, rmse_global, r2_global = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mean_squared_error(y_test, y_pred)),
    r2_score(y_test, y_pred),
)

# One print call, one line per item
print(
    "Global XGBoost Model Performance",
    f"MAE  : {mae_global:.3f}",
    f"RMSE : {rmse_global:.3f}",
    f"R²   : {r2_global:.3f}",
    sep="\n",
)
