In [34]:
import pandas as pd

# Read the raw CSV into a DataFrame
file_path = 'data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to get a feel for the available columns
data.head(n=5)

Unnamed: 0,subject#,age,sex,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,...,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,motor_UPDRS,total_UPDRS
0,1,72,0,5.6431,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,...,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006,28.199,34.398
1,1,72,0,12.666,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,...,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081,28.447,34.894
2,1,72,0,19.681,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,...,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014,28.695,35.389
3,1,72,0,25.647,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,...,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277,28.905,35.81
4,1,72,0,33.642,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,...,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,29.187,36.375


In [38]:
from sklearn.preprocessing import StandardScaler

# Voice-measure columns are every column except the identifier, the
# demographic/time columns and the two UPDRS targets. They are selected by
# name (not by position) so the selection stays correct if the CSV column
# order changes; Index.drop raises KeyError if an expected column is missing.
non_voice_columns = ['subject#', 'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS']
voice_measure_columns = data.columns.drop(non_voice_columns)

# Standardize the voice measures on a copy so the raw `data` frame is untouched.
# NOTE(review): the scaler is fit on every row here, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Confirm this is acceptable for the evaluation.
scaler = StandardScaler()
data_standardized = data.copy()
data_standardized[voice_measure_columns] = scaler.fit_transform(data[voice_measure_columns])

# Summary statistics of the standardized voice-measure columns
data_standardized[voice_measure_columns].describe()

Unnamed: 0,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,1.451321e-16,1.741586e-16,1.935095e-17,-1.935095e-17,1.693208e-16,1.741586e-16,1.83834e-16,4.8377380000000006e-17,1.548076e-16,-3.87019e-17,9.675476000000001e-17,-3.87019e-17,-6.385814e-16,1.935095e-16,4.450719e-16,1.741586e-16
std,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085,1.000085
min,-0.946657,-1.161265,-0.850693,-0.7629815,-0.8517702,-1.199058,-1.237697,-1.174502,-1.092523,-1.250516,-1.174245,-0.5333439,-4.665987,-3.866744,-1.963436,-2.15985
25%,-0.4576594,-0.6012255,-0.4505077,-0.3904491,-0.4515863,-0.5773716,-0.5905295,-0.5950218,-0.5613909,-0.5912529,-0.5952688,-0.3545954,-0.529862,-0.7099418,-0.8048365,-0.6913181
50%,-0.2229405,-0.2649238,-0.2360084,-0.2108831,-0.2360206,-0.2525927,-0.2517438,-0.2610839,-0.2523139,-0.2387287,-0.2608275,-0.2290577,0.05605219,0.007695473,-0.135969,-0.1539957
75%,0.1149124,0.2575945,0.09694579,0.04908555,0.09693236,0.2212206,0.2347177,0.2583332,0.216703,0.2619106,0.2585891,-0.01100432,0.6442969,0.7186976,0.819445,0.4907692
max,16.68571,11.16072,17.46499,17.76448,17.46598,9.08122,7.800931,10.99383,8.814745,12.40861,10.99408,11.99822,3.774533,4.20498,2.99538,5.597736


In [42]:
# Summary statistics for every column, to spot anomalies or outliers.
# display() is required because only a cell's last expression is rendered
# automatically; without it this result would be computed and thrown away.
display(data_standardized.describe())

# Distribution summary of the two regression targets
targets_balance = data_standardized[['motor_UPDRS', 'total_UPDRS']].describe()

targets_balance

Unnamed: 0,motor_UPDRS,total_UPDRS
count,5875.0,5875.0
mean,21.296229,29.018942
std,8.129282,10.700283
min,5.0377,7.0
25%,15.0,21.371
50%,20.871,27.576
75%,27.5965,36.399
max,39.511,54.992


Linear Regression

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Predictors: every column except the subject identifier and the two targets
predictor_vars = data_standardized.columns.drop(['subject#', 'motor_UPDRS', 'total_UPDRS'])

# Feature matrix and the two regression targets
X = data_standardized.loc[:, predictor_vars]
y_motor_UPDRS = data_standardized.loc[:, 'motor_UPDRS']
y_total_UPDRS = data_standardized.loc[:, 'total_UPDRS']

# 70/30 train/test split for motor_UPDRS
X_train_motor, X_test_motor, y_train_motor, y_test_motor = train_test_split(
    X, y_motor_UPDRS, test_size=0.3, random_state=42
)

# 70/30 train/test split for total_UPDRS (same seed as above)
X_train_total, X_test_total, y_train_total, y_test_total = train_test_split(
    X, y_total_UPDRS, test_size=0.3, random_state=42
)

# motor_UPDRS: fit, predict on the held-out rows, then score
model_motor_UPDRS = LinearRegression().fit(X_train_motor, y_train_motor)
y_pred_motor_UPDRS = model_motor_UPDRS.predict(X_test_motor)
mse_motor_UPDRS = mean_squared_error(y_test_motor, y_pred_motor_UPDRS)
r2_motor_UPDRS = r2_score(y_test_motor, y_pred_motor_UPDRS)

# total_UPDRS: fit, predict on the held-out rows, then score
model_total_UPDRS = LinearRegression().fit(X_train_total, y_train_total)
y_pred_total_UPDRS = model_total_UPDRS.predict(X_test_total)
mse_total_UPDRS = mean_squared_error(y_test_total, y_pred_total_UPDRS)
r2_total_UPDRS = r2_score(y_test_total, y_pred_total_UPDRS)

# Test-set (MSE, R^2) for motor_UPDRS followed by (MSE, R^2) for total_UPDRS
(mse_motor_UPDRS, r2_motor_UPDRS, mse_total_UPDRS, r2_total_UPDRS)

(55.49565645486397,
 0.14849361407788375,
 92.22558174712326,
 0.18007328926902033)

Random Forests

In [52]:
from sklearn.ensemble import RandomForestRegressor

# motor_UPDRS: seeded random forest, fit on the training split and scored on the test split
rf_model_motor_UPDRS = RandomForestRegressor(random_state=42).fit(X_train_motor, y_train_motor)
y_pred_rf_motor_UPDRS = rf_model_motor_UPDRS.predict(X_test_motor)
mse_rf_motor_UPDRS = mean_squared_error(y_test_motor, y_pred_rf_motor_UPDRS)
r2_rf_motor_UPDRS = r2_score(y_test_motor, y_pred_rf_motor_UPDRS)

# total_UPDRS: same procedure with its own model instance
rf_model_total_UPDRS = RandomForestRegressor(random_state=42).fit(X_train_total, y_train_total)
y_pred_rf_total_UPDRS = rf_model_total_UPDRS.predict(X_test_total)
mse_rf_total_UPDRS = mean_squared_error(y_test_total, y_pred_rf_total_UPDRS)
r2_rf_total_UPDRS = r2_score(y_test_total, y_pred_rf_total_UPDRS)

# Test-set (MSE, R^2) for motor_UPDRS followed by (MSE, R^2) for total_UPDRS
(mse_rf_motor_UPDRS, r2_rf_motor_UPDRS, mse_rf_total_UPDRS, r2_rf_total_UPDRS)

(1.9847780730480413, 0.9695462435837089, 2.702827064401372, 0.9759706573533388)

XGBoost

In [10]:
!pip install xgboost



In [53]:
from xgboost import XGBRegressor

# motor_UPDRS: seeded XGBoost regressor with default hyperparameters
xgb_model_motor_UPDRS = XGBRegressor(random_state=42)
xgb_model_motor_UPDRS.fit(X_train_motor, y_train_motor)
y_pred_xgb_motor_UPDRS = xgb_model_motor_UPDRS.predict(X_test_motor)
mse_xgb_motor_UPDRS = mean_squared_error(y_test_motor, y_pred_xgb_motor_UPDRS)
r2_xgb_motor_UPDRS = r2_score(y_test_motor, y_pred_xgb_motor_UPDRS)

# total_UPDRS: same procedure with its own model instance
xgb_model_total_UPDRS = XGBRegressor(random_state=42)
xgb_model_total_UPDRS.fit(X_train_total, y_train_total)
y_pred_xgb_total_UPDRS = xgb_model_total_UPDRS.predict(X_test_total)
mse_xgb_total_UPDRS = mean_squared_error(y_test_total, y_pred_xgb_total_UPDRS)
r2_xgb_total_UPDRS = r2_score(y_test_total, y_pred_xgb_total_UPDRS)

# Test-set (MSE, R^2) for motor_UPDRS followed by (MSE, R^2) for total_UPDRS
(mse_xgb_motor_UPDRS, r2_xgb_motor_UPDRS, mse_xgb_total_UPDRS, r2_xgb_total_UPDRS)


(3.2965058076263993, 0.9494195415328365, 4.298755838281754, 0.9617821360630463)

In [54]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Candidate hyperparameter values sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Randomized search for motor_UPDRS.
# random_state is set on the search itself (not only on the estimator) so the
# 10 sampled candidates are the same on every run; without it the reported
# best parameters and score change between executions.
rand_search_motor = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rand_search_motor.fit(X_train_motor, y_train_motor)

# Randomized search for total_UPDRS (same settings and seed)
rand_search_total = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rand_search_total.fit(X_train_total, y_train_total)

# Best parameters and best mean cross-validated score for motor_UPDRS
best_params_motor = rand_search_motor.best_params_
best_score_motor = rand_search_motor.best_score_

# Best parameters and best mean cross-validated score for total_UPDRS
best_params_total = rand_search_total.best_params_
best_score_total = rand_search_total.best_score_

best_params_motor, best_score_motor, best_params_total, best_score_total

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits


({'subsample': 0.9,
  'n_estimators': 100,
  'min_child_weight': 1,
  'max_depth': 8,
  'learning_rate': 0.15,
  'gamma': 0,
  'colsample_bytree': 0.9},
 0.9527193717602334,
 {'subsample': 0.8,
  'n_estimators': 500,
  'min_child_weight': 1,
  'max_depth': 7,
  'learning_rate': 0.2,
  'gamma': 0.2,
  'colsample_bytree': 0.9},
 0.9549856155420756)

SVR

In [55]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# One untuned SVR per target; the grid search clones and tunes these
svr_motor_UPDRS = SVR()
svr_total_UPDRS = SVR()

# Hyperparameter grid explored exhaustively for both targets
param_grid = {
    'C': [0.1, 1, 10, 20, 50],
    'epsilon': [0.01, 0.1, 0.5, 1],
    'gamma': ['scale', 'auto', 0.1, 1]
}

# 3-fold grid search for motor_UPDRS, using all available cores
grid_search_motor_UPDRS = GridSearchCV(
    estimator=svr_motor_UPDRS, param_grid=param_grid, cv=3, n_jobs=-1
)
grid_search_motor_UPDRS.fit(X_train_motor, y_train_motor)

# 3-fold grid search for total_UPDRS, using all available cores
grid_search_total_UPDRS = GridSearchCV(
    estimator=svr_total_UPDRS, param_grid=param_grid, cv=3, n_jobs=-1
)
grid_search_total_UPDRS.fit(X_train_total, y_train_total)

# Winning hyperparameters and their mean cross-validated score, per target
best_params_motor_UPDRS = grid_search_motor_UPDRS.best_params_
best_params_total_UPDRS = grid_search_total_UPDRS.best_params_
best_score_motor_UPDRS = grid_search_motor_UPDRS.best_score_
best_score_total_UPDRS = grid_search_total_UPDRS.best_score_

(best_params_motor_UPDRS, best_score_motor_UPDRS, best_params_total_UPDRS, best_score_total_UPDRS)


({'C': 50, 'epsilon': 0.01, 'gamma': 0.1},
 0.7462142755023985,
 {'C': 50, 'epsilon': 0.01, 'gamma': 0.1},
 0.759947307772148)

In [59]:
from sklearn.svm import SVR

# Build the final SVR models from the grid-search results
# (best_params_motor_UPDRS / best_params_total_UPDRS) instead of hand-copied
# values, so they cannot drift from what the search actually selected.
svr_motor_UPDRS_best = SVR(**best_params_motor_UPDRS)
svr_total_UPDRS_best = SVR(**best_params_total_UPDRS)

# Train the SVR models on the training splits
svr_motor_UPDRS_best.fit(X_train_motor, y_train_motor)
svr_total_UPDRS_best.fit(X_train_total, y_train_total)

# Predictions on the held-out test splits
y_pred_motor_UPDRS_svr = svr_motor_UPDRS_best.predict(X_test_motor)
y_pred_total_UPDRS_svr = svr_total_UPDRS_best.predict(X_test_total)

# Calculate test-set R-squared for both models
from sklearn.metrics import r2_score

r2_motor_UPDRS_svr = r2_score(y_test_motor, y_pred_motor_UPDRS_svr)
r2_total_UPDRS_svr = r2_score(y_test_total, y_pred_total_UPDRS_svr)

r2_motor_UPDRS_svr, r2_total_UPDRS_svr


(0.8315844261327189, 0.8525354444859341)