In [130]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor


In [131]:
# 1. Load dataset

df = pd.read_csv("parkinsons_updrs.csv")


In [132]:
df.head()

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [133]:
df.describe()

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,...,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,21.494128,64.804936,0.317787,92.863722,21.296229,29.018942,0.006154,4.4e-05,0.002987,0.003277,...,0.31096,0.017156,0.020144,0.027481,0.051467,0.03212,21.679495,0.541473,0.65324,0.219589
std,12.372279,8.821524,0.465656,53.445602,8.129282,10.700283,0.005624,3.6e-05,0.003124,0.003732,...,0.230254,0.013237,0.016664,0.019986,0.039711,0.059692,4.291096,0.100986,0.070902,0.091498
min,1.0,36.0,0.0,-4.2625,5.0377,7.0,0.00083,2e-06,0.00033,0.00043,...,0.026,0.00161,0.00194,0.00249,0.00484,0.000286,1.659,0.15102,0.51404,0.021983
25%,10.0,58.0,0.0,46.8475,15.0,21.371,0.00358,2.2e-05,0.00158,0.00182,...,0.175,0.00928,0.01079,0.015665,0.02783,0.010955,19.406,0.469785,0.59618,0.15634
50%,22.0,65.0,0.0,91.523,20.871,27.576,0.0049,3.5e-05,0.00225,0.00249,...,0.253,0.0137,0.01594,0.02271,0.04111,0.018448,21.92,0.54225,0.6436,0.2055
75%,33.0,72.0,1.0,138.445,27.5965,36.399,0.0068,5.3e-05,0.00329,0.00346,...,0.365,0.020575,0.023755,0.032715,0.061735,0.031463,24.444,0.614045,0.711335,0.26449
max,42.0,85.0,1.0,215.49,39.511,54.992,0.09999,0.000446,0.05754,0.06956,...,2.107,0.16267,0.16702,0.27546,0.48802,0.74826,37.875,0.96608,0.8656,0.73173


In [134]:
df.isna().sum()

Unnamed: 0,0
subject#,0
age,0
sex,0
test_time,0
motor_UPDRS,0
total_UPDRS,0
Jitter(%),0
Jitter(Abs),0
Jitter:RAP,0
Jitter:PPQ5,0


In [135]:
# -------------------------
# Columns
# -------------------------
TARGET = "total_UPDRS"
AGE_COL = "age"
GENDER_COL = "sex"   # 0 = female, 1 = male



In [136]:
# Drop non-speech columns
NON_FEATURES = ["subject#", "age", "sex", "motor_UPDRS", "total_UPDRS"]
X = df.drop(columns=NON_FEATURES)
y = df[TARGET]



In [137]:
X.head()

Unnamed: 0,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,5.6431,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,12.666,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,19.681,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,25.647,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,33.642,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [138]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5875 entries, 0 to 5874
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   test_time      5875 non-null   float64
 1   Jitter(%)      5875 non-null   float64
 2   Jitter(Abs)    5875 non-null   float64
 3   Jitter:RAP     5875 non-null   float64
 4   Jitter:PPQ5    5875 non-null   float64
 5   Jitter:DDP     5875 non-null   float64
 6   Shimmer        5875 non-null   float64
 7   Shimmer(dB)    5875 non-null   float64
 8   Shimmer:APQ3   5875 non-null   float64
 9   Shimmer:APQ5   5875 non-null   float64
 10  Shimmer:APQ11  5875 non-null   float64
 11  Shimmer:DDA    5875 non-null   float64
 12  NHR            5875 non-null   float64
 13  HNR            5875 non-null   float64
 14  RPDE           5875 non-null   float64
 15  DFA            5875 non-null   float64
 16  PPE            5875 non-null   float64
dtypes: float64(17)
memory usage: 780.4 KB


In [139]:
y.head()

Unnamed: 0,total_UPDRS
0,34.398
1,34.894
2,35.389
3,35.81
4,36.375


In [140]:
# =========================
# 2. Feature scaling + PCA
# =========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=13, random_state=42)
X_pca = pca.fit_transform(X_scaled)



In [141]:
# Rebuild dataframe with metadata
pca_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(13)])
pca_df[AGE_COL] = df[AGE_COL].values
pca_df[GENDER_COL] = df[GENDER_COL].values
pca_df[TARGET] = y.values





In [142]:
# =========================
# 3. Age & Gender Partition
# =========================
def age_group(age):
    if age < 60:
        return "lt60"
    elif 60 <= age <= 70:
        return "60to70"
    else:
        return "gt70"

pca_df["age_group"] = pca_df[AGE_COL].apply(age_group)



In [143]:
# =========================
# Create group-wise datasets
# =========================

male_lt60 = pca_df[
    (pca_df[GENDER_COL] == 1) &
    (pca_df["age_group"] == "lt60")
]

male_60_70 = pca_df[
    (pca_df[GENDER_COL] == 1) &
    (pca_df["age_group"] == "60to70")
]

male_gt70 = pca_df[
    (pca_df[GENDER_COL] == 1) &
    (pca_df["age_group"] == "gt70")
]

female_lt60 = pca_df[
    (pca_df[GENDER_COL] == 0) &
    (pca_df["age_group"] == "lt60")
]

female_60_70 = pca_df[
    (pca_df[GENDER_COL] == 0) &
    (pca_df["age_group"] == "60to70")
]

female_gt70 = pca_df[
    (pca_df[GENDER_COL] == 0) &
    (pca_df["age_group"] == "gt70")
]


In [144]:
male_lt60.info()

<class 'pandas.core.frame.DataFrame'>
Index: 756 entries, 1855 to 5125
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          756 non-null    float64
 1   PC2          756 non-null    float64
 2   PC3          756 non-null    float64
 3   PC4          756 non-null    float64
 4   PC5          756 non-null    float64
 5   PC6          756 non-null    float64
 6   PC7          756 non-null    float64
 7   PC8          756 non-null    float64
 8   PC9          756 non-null    float64
 9   PC10         756 non-null    float64
 10  PC11         756 non-null    float64
 11  PC12         756 non-null    float64
 12  PC13         756 non-null    float64
 13  age          756 non-null    int64  
 14  sex          756 non-null    int64  
 15  total_UPDRS  756 non-null    float64
 16  age_group    756 non-null    object 
dtypes: float64(14), int64(2), object(1)
memory usage: 106.3+ KB


In [145]:
male_60_70.info()

<class 'pandas.core.frame.DataFrame'>
Index: 573 entries, 2272 to 5724
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          573 non-null    float64
 1   PC2          573 non-null    float64
 2   PC3          573 non-null    float64
 3   PC4          573 non-null    float64
 4   PC5          573 non-null    float64
 5   PC6          573 non-null    float64
 6   PC7          573 non-null    float64
 7   PC8          573 non-null    float64
 8   PC9          573 non-null    float64
 9   PC10         573 non-null    float64
 10  PC11         573 non-null    float64
 11  PC12         573 non-null    float64
 12  PC13         573 non-null    float64
 13  age          573 non-null    int64  
 14  sex          573 non-null    int64  
 15  total_UPDRS  573 non-null    float64
 16  age_group    573 non-null    object 
dtypes: float64(14), int64(2), object(1)
memory usage: 80.6+ KB


In [146]:
male_gt70.info()

<class 'pandas.core.frame.DataFrame'>
Index: 538 entries, 1048 to 5559
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          538 non-null    float64
 1   PC2          538 non-null    float64
 2   PC3          538 non-null    float64
 3   PC4          538 non-null    float64
 4   PC5          538 non-null    float64
 5   PC6          538 non-null    float64
 6   PC7          538 non-null    float64
 7   PC8          538 non-null    float64
 8   PC9          538 non-null    float64
 9   PC10         538 non-null    float64
 10  PC11         538 non-null    float64
 11  PC12         538 non-null    float64
 12  PC13         538 non-null    float64
 13  age          538 non-null    int64  
 14  sex          538 non-null    int64  
 15  total_UPDRS  538 non-null    float64
 16  age_group    538 non-null    object 
dtypes: float64(14), int64(2), object(1)
memory usage: 75.7+ KB


In [147]:
female_lt60.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1121 entries, 149 to 4691
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          1121 non-null   float64
 1   PC2          1121 non-null   float64
 2   PC3          1121 non-null   float64
 3   PC4          1121 non-null   float64
 4   PC5          1121 non-null   float64
 5   PC6          1121 non-null   float64
 6   PC7          1121 non-null   float64
 7   PC8          1121 non-null   float64
 8   PC9          1121 non-null   float64
 9   PC10         1121 non-null   float64
 10  PC11         1121 non-null   float64
 11  PC12         1121 non-null   float64
 12  PC13         1121 non-null   float64
 13  age          1121 non-null   int64  
 14  sex          1121 non-null   int64  
 15  total_UPDRS  1121 non-null   float64
 16  age_group    1121 non-null   object 
dtypes: float64(14), int64(2), object(1)
memory usage: 157.6+ KB


In [148]:
female_60_70.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1554 entries, 731 to 5874
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          1554 non-null   float64
 1   PC2          1554 non-null   float64
 2   PC3          1554 non-null   float64
 3   PC4          1554 non-null   float64
 4   PC5          1554 non-null   float64
 5   PC6          1554 non-null   float64
 6   PC7          1554 non-null   float64
 7   PC8          1554 non-null   float64
 8   PC9          1554 non-null   float64
 9   PC10         1554 non-null   float64
 10  PC11         1554 non-null   float64
 11  PC12         1554 non-null   float64
 12  PC13         1554 non-null   float64
 13  age          1554 non-null   int64  
 14  sex          1554 non-null   int64  
 15  total_UPDRS  1554 non-null   float64
 16  age_group    1554 non-null   object 
dtypes: float64(14), int64(2), object(1)
memory usage: 218.5+ KB


In [149]:
female_gt70.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1333 entries, 0 to 4856
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          1333 non-null   float64
 1   PC2          1333 non-null   float64
 2   PC3          1333 non-null   float64
 3   PC4          1333 non-null   float64
 4   PC5          1333 non-null   float64
 5   PC6          1333 non-null   float64
 6   PC7          1333 non-null   float64
 7   PC8          1333 non-null   float64
 8   PC9          1333 non-null   float64
 9   PC10         1333 non-null   float64
 10  PC11         1333 non-null   float64
 11  PC12         1333 non-null   float64
 12  PC13         1333 non-null   float64
 13  age          1333 non-null   int64  
 14  sex          1333 non-null   int64  
 15  total_UPDRS  1333 non-null   float64
 16  age_group    1333 non-null   object 
dtypes: float64(14), int64(2), object(1)
memory usage: 187.5+ KB


In [150]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from xgboost import XGBRegressor
import numpy as np


def train_xgb_group(data, group_name):
    X = data[[f"PC{i+1}" for i in range(13)]]
    y = data[TARGET]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42
    )

    model = XGBRegressor(
        n_estimators=710,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ===== Metrics =====
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"\n{group_name}")
    print(f"MAE  : {mae:.3f}")
    print(f"RMSE : {rmse:.3f}")
    print(f"R²   : {r2:.3f}")

    return model, mae, rmse, r2


**Model for Male < 60**

In [151]:
model_male_lt60, mae_male_lt60, rmse_male_lt60, r2_male_lt60 = \
    train_xgb_group(male_lt60, "Male < 60")



Male < 60
MAE  : 4.293
RMSE : 6.236
R²   : 0.686


**Model for Male between 60 to 70**

In [152]:
model_male_60_70, mae_male_60_70, rmse_male_60_70, r2_male_60_70 = \
    train_xgb_group(male_60_70, "Male 60–70")



Male 60–70
MAE  : 1.358
RMSE : 2.265
R²   : 0.860


**Model for Male > 70**

In [153]:
model_male_gt70, mae_male_gt70, rmse_male_gt70, r2_male_gt70 = \
    train_xgb_group(male_gt70, "Male > 70")



Male > 70
MAE  : 1.867
RMSE : 2.551
R²   : 0.646


**Model for Female < 60**

In [154]:
model_female_lt60, mae_female_lt60, rmse_female_lt60, r2_female_lt60 = \
    train_xgb_group(female_lt60, "Female < 60")



Female < 60
MAE  : 2.689
RMSE : 3.558
R²   : 0.733


**Model for Female between 60 to 70**

In [155]:
model_female_60_70, mae_female_60_70, rmse_female_60_70, r2_female_60_70 = \
    train_xgb_group(female_60_70, "Female 60–70")



Female 60–70
MAE  : 5.147
RMSE : 7.433
R²   : 0.439


**Model for Female > 70**

In [156]:
model_female_gt70, mae_female_gt70, rmse_female_gt70, r2_female_gt70 = \
    train_xgb_group(female_gt70, "Female > 70")



Female > 70
MAE  : 4.594
RMSE : 6.139
R²   : 0.666


In [157]:
results_df = pd.DataFrame({
    "Group": [
        "Male < 60", "Male 60–70", "Male > 70",
        "Female < 60", "Female 60–70", "Female > 70"
    ],
    "MAE": [
        mae_male_lt60, mae_male_60_70, mae_male_gt70,
        mae_female_lt60, mae_female_60_70, mae_female_gt70
    ],
    "RMSE": [
        rmse_male_lt60, rmse_male_60_70, rmse_male_gt70,
        rmse_female_lt60, rmse_female_60_70, rmse_female_gt70
    ],
    "R2": [
        r2_male_lt60, r2_male_60_70, r2_male_gt70,
        r2_female_lt60, r2_female_60_70, r2_female_gt70
    ]
})

print(results_df)


          Group       MAE      RMSE        R2
0     Male < 60  4.292656  6.235728  0.686151
1    Male 60–70  1.357523  2.265032  0.859603
2     Male > 70  1.867003  2.550542  0.645542
3   Female < 60  2.688653  3.558392  0.733216
4  Female 60–70  5.147219  7.433225  0.439491
5   Female > 70  4.593656  6.138987  0.665548


**Model Without Splitting the data based on age and gender**

In [158]:
# PCA features
X_all = pca_df[[f"PC{i+1}" for i in range(13)]]

# Target
y_all = pca_df[TARGET]


In [159]:
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.15,
    random_state=42
)


In [160]:
model_global = XGBRegressor(
    n_estimators=710,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

model_global.fit(X_train, y_train)


In [161]:
y_pred = model_global.predict(X_test)

mae_global = mean_absolute_error(y_test, y_pred)
rmse_global = np.sqrt(mean_squared_error(y_test, y_pred))
r2_global = r2_score(y_test, y_pred)

print("Global XGBoost Model Performance")
print(f"MAE  : {mae_global:.3f}")
print(f"RMSE : {rmse_global:.3f}")
print(f"R²   : {r2_global:.3f}")


Global XGBoost Model Performance
MAE  : 6.152
RMSE : 8.067
R²   : 0.402
