In [1]:
import pandas as pd

# Read the csv files
x_train_scale = pd.read_csv('x_train_scale.csv')
y_train_scale = pd.read_csv('y_train_scale.csv')

In [2]:
# Show the first 5 rows of the dataframes
print("x_train_scale:")
print(x_train_scale.head())
print("\ny_train_scale:")
print(y_train_scale.head())

x_train_scale:
   subject#       age  sex  test_time  Jitter(%)  Jitter(Abs)  Jitter:RAP  \
0         6 -0.198540    0  -1.638719   0.619703     0.740882    0.805863   
1        40  2.290131    1   0.655123  -0.298646    -0.402497   -0.371813   
2        42 -0.424783    0   1.200335  -0.401872    -0.334106   -0.355812   
3        20  0.253946    0  -1.517513  -0.341360    -0.450510   -0.269406   
4         4  1.045796    0   0.280279   0.011030    -0.013089    0.060215   

   Jitter:PPQ5  Jitter:DDP   Shimmer  Shimmer(dB)  Shimmer:APQ3  Shimmer:APQ5  \
0     0.477598    0.805846  0.386256     0.327007      0.627476      0.221878   
1    -0.324201   -0.370755  0.008842    -0.054711      0.000760      0.096563   
2    -0.302748   -0.356888 -0.303740    -0.384378     -0.273194     -0.258995   
3    -0.283977   -0.270483 -0.366642    -0.349676     -0.345998     -0.333344   
4    -0.026543    0.061270  1.407744     1.424449      1.646735      1.797604   

   Shimmer:APQ11  Shimmer:DDA      

In [3]:
# pip install statsmodels

In [7]:
# Do simple linear regression (OLS):
import statsmodels.api as sm

# Drop column "subject#" and "test_time" from the independent variable matrix
X = x_train_scale.drop(['subject#', 'test_time'], axis=1)

# Add a constant to the independent variable matrix
X_with_const = sm.add_constant(X)

# Use "motor_UPDRS" as the dependent variable
y = y_train_scale['motor_UPDRS']

# Fit the model
ols_model_motor = sm.OLS(y, X_with_const).fit()

# Print the summary
print(ols_model_motor.summary())

                            OLS Regression Results                            
Dep. Variable:            motor_UPDRS   R-squared:                       0.149
Model:                            OLS   Adj. R-squared:                  0.146
Method:                 Least Squares   F-statistic:                     45.64
Date:                Tue, 09 Apr 2024   Prob (F-statistic):          2.46e-149
Time:                        16:38:12   Log-Likelihood:                -6289.0
No. Observations:                4700   AIC:                         1.262e+04
Df Residuals:                    4681   BIC:                         1.274e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0504      0.017      2.940

Test results for normality and autocorrelation are shown above (Jarque-Bera test and Durbin-Watson test).

Test for heteroskedasticity:

In [8]:
# White test
import statsmodels.stats.diagnostic as sm_diag

_, pval, _, f_pval = sm_diag.het_white(ols_model_motor.resid, ols_model_motor.model.exog)
print("White test p-value:", pval)
print("White test F-statistic p-value:", f_pval)


White test p-value: 2.7912638602429617e-116
White test F-statistic p-value: 8.286013025490796e-136


In [9]:
# Run Breusch-Pagan test
_, pval, _, _ = sm_diag.het_breuschpagan(ols_model_motor.resid, ols_model_motor.model.exog)
print("Breusch-Pagan test p-value:", pval)


Breusch-Pagan test p-value: 5.1236541887322196e-33


In [10]:
# Use "total_UPDRS" as the dependent variable
y = y_train_scale['total_UPDRS']

# Fit the model
ols_model_total = sm.OLS(y, X_with_const).fit()

# Print the summary
print(ols_model_total.summary())

                            OLS Regression Results                            
Dep. Variable:            total_UPDRS   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.169
Method:                 Least Squares   F-statistic:                     54.11
Date:                Tue, 09 Apr 2024   Prob (F-statistic):          1.28e-176
Time:                        16:45:43   Log-Likelihood:                -6224.8
No. Observations:                4700   AIC:                         1.249e+04
Df Residuals:                    4681   BIC:                         1.261e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0872      0.017      5.157

Test results for normality and autocorrelation are shown above (Jarque-Bera test and Durbin-Watson test).

Test for heteroskedasticity:

In [11]:
# White test
import statsmodels.stats.diagnostic as sm_diag

_, pval, _, f_pval = sm_diag.het_white(ols_model_total.resid, ols_model_total.model.exog)
print("White test p-value:", pval)
print("White test F-statistic p-value:", f_pval)


White test p-value: 6.910923555266251e-153
White test F-statistic p-value: 5.315329492315364e-184


In [12]:
# Run Breusch-Pagan test
_, pval, _, _ = sm_diag.het_breuschpagan(ols_model_motor.resid, ols_model_motor.model.exog)
print("Breusch-Pagan test p-value:", pval)


Breusch-Pagan test p-value: 5.1236541887322196e-33
