In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [2]:
df = pd.read_csv(r'C:\Users\Cameron\OneDrive\Documents\DCU_Data\MWBW full dataset 2377 participants_anonymised (1).csv')

#drop all nan values for the whole dataset
drop_set = df[['new_calc_bmi','VO2max','age','puntkick_total','handpass_total','groundstrike_total','balance_catch_total','run_total','skip_total','gallop_total','slide_total','hop_total','hj_total','vj_total','mnm_total','catch_total','throw_total','roll_total','kick_total','2hs_total','1hs_total','bounce_total','balance_total']].copy()
drop_set = drop_set.dropna()
drop_set = drop_set.reset_index(drop=True)

drop_set

Unnamed: 0,new_calc_bmi,VO2max,age,puntkick_total,handpass_total,groundstrike_total,balance_catch_total,run_total,skip_total,gallop_total,...,vj_total,mnm_total,catch_total,throw_total,roll_total,kick_total,2hs_total,1hs_total,bounce_total,balance_total
0,20.087236,46.29,10.0,10.0,10.0,10.0,7.0,7.0,4.0,2.0,...,12.0,9.0,6.0,7.0,6.0,6.0,6.0,8.0,5.0,8.0
1,15.996408,61.86,9.0,10.0,6.0,7.0,5.0,8.0,6.0,4.0,...,11.0,9.0,6.0,4.0,5.0,6.0,7.0,8.0,6.0,8.0
2,15.254607,54.93,9.0,2.0,4.0,1.0,5.0,8.0,6.0,2.0,...,12.0,8.0,6.0,3.0,7.0,7.0,6.0,1.0,5.0,8.0
3,20.698502,45.69,10.0,6.0,6.0,8.0,4.0,8.0,4.0,2.0,...,10.0,6.0,6.0,4.0,5.0,3.0,5.0,8.0,0.0,8.0
4,15.472683,59.55,10.0,10.0,10.0,10.0,8.0,8.0,6.0,2.0,...,10.0,9.0,6.0,5.0,4.0,8.0,10.0,8.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,14.367644,61.82,12.0,10.0,10.0,0.0,0.0,8.0,5.0,6.0,...,12.0,12.0,6.0,7.0,7.0,8.0,10.0,7.0,4.0,8.0
909,22.031726,53.02,12.0,8.0,10.0,0.0,0.0,8.0,6.0,5.0,...,10.0,6.0,6.0,4.0,5.0,4.0,5.0,4.0,1.0,8.0
910,17.777778,51.62,13.0,8.0,10.0,0.0,0.0,8.0,5.0,6.0,...,12.0,8.0,6.0,4.0,7.0,2.0,5.0,3.0,5.0,8.0
911,21.113276,55.56,12.0,10.0,10.0,0.0,0.0,8.0,6.0,6.0,...,12.0,12.0,6.0,7.0,8.0,8.0,7.0,8.0,6.0,8.0


In [31]:
#get all bmi values between 10 and 40
a = drop_set['new_calc_bmi'].copy()
a = a.astype('float')
a = a[(a >= 10) & (a <= 40)]
#a.drop(index=a.index[509],axis=0,inplace=True)

a

0      20.087236
1      15.996408
2      15.254607
3      20.698502
4      15.472683
         ...    
908    14.367644
909    22.031726
910    17.777778
911    21.113276
912    17.849103
Name: new_calc_bmi, Length: 912, dtype: float64

In [15]:
#create the dependent variable using VO2max
y_var = zscore(drop_set['VO2max'].copy())
y_var = y_var.astype('float')
y_var.drop(index=y_var.index[509], axis=0, inplace=True)
y_var = y_var.reset_index(drop=True)

y_var

(912,)

In [24]:
#concat age to bmi
x_var = drop_set['age'].copy()
x_var = pd.concat([a,x_var], axis=1)
x_var = x_var.astype('float')
x_var = zscore(x_var,nan_policy='omit')
x_var = x_var.dropna().reset_index(drop=True)

x_var.shape

(912, 2)

In [23]:
#create categorical dataset and one-hot encode them
x_var2_list = ['puntkick_total','handpass_total','groundstrike_total','balance_catch_total','run_total','skip_total','gallop_total','slide_total','hop_total','hj_total','vj_total','mnm_total','catch_total','throw_total','roll_total','kick_total','2hs_total','1hs_total','bounce_total','balance_total']
x_var2 = drop_set[x_var2_list].copy()

x = pd.get_dummies(x_var2, columns=x_var2_list)

x_var = pd.concat([x_var,x], axis=1)
#x_var = x_var.dropna()
#x_var = x_var.reset_index(drop=True)

(913, 195)

In [14]:
column_names = x_var.columns

X_t, X_remain, y_t, y_remain = train_test_split(x_var, y_var, test_size=0.3)
X_test, X_val, y_test, y_val = train_test_split(X_remain, y_remain, test_size=0.5)

X_train = np.concatenate((X_t, X_val))
y_train = np.concatenate((y_t, y_val))

#x_train = pd.DataFrame(X_train, columns=column_names)
#y_train = pd.DataFrame(y_train, columns=['VO2max'])

model = LinearRegression().fit(X_train,y_train)

[ 2.19617837  1.99076295 -1.02935303  0.09788531  0.23539481  0.36271842
  1.17249656 -1.15667664  1.12666006 -0.94107533 -0.59815041 -1.57429807
 -0.1567619   1.34565667  0.51550674 -0.04301948  0.51550674 -0.94107533
 -1.46055565 -0.54891862 -0.85279763  1.55786268  1.77176634  3.02632829
 -1.89175826  0.93482583  0.76845631 -1.15667664 -1.02935303  1.55786268
 -1.25853552 -0.54891862 -1.15667664 -0.04301948 -0.44705973 -0.04301948
 -0.1567619   2.60700921 -0.85279763  1.17249656  1.17249656 -0.54891862
 -0.85279763  0.76845631  1.12666006 -0.44705973  1.79044047  0.62755152
 -0.7390552   0.51550674  0.69545744 -0.04301948  0.51550674  1.55786268
  1.35244726  2.58833508 -0.31973612 -1.15667664  2.18938777 -1.15667664
 -1.02935303  2.19617837 -0.16694779 -0.04301948  1.01970823 -0.54891862
  0.23539481 -0.1567619  -0.7390552   0.51550674  0.26425483 -1.02935303
 -0.7390552   0.62755152 -0.85279763 -0.54891862 -1.15667664 -0.04301948
  0.45778671  1.17249656  1.80402165  0.09788531 -1

In [8]:
y_hat = model.predict(X_test)

#y_hat_data = pd.DataFrame(y_hat,columns=['predicted_VO2max'])

#linear_result = pd.concat([y_hat_data, y_test], axis=1)

mean_absolute = mean_absolute_error(y_test, y_hat)

print('Linear Regression\n'
      '-------------------------')

print(f'The mean absolute error is: {mean_absolute}')

r2 = r2_score(y_test, y_hat)

print(f'The R2 score for the model is: {r2}')

#linear_result

Linear Regression
-------------------------
The mean absolute error is: 119331648.2503783
The R2 score for the model is: -7.249694433748205e+17




In [9]:
column_names = x_var.columns

X_t, X_remain, y_t, y_remain = train_test_split(x_var, y_var, test_size=0.3)
X_test, X_val, y_test, y_val = train_test_split(X_remain, y_remain, test_size=0.5)

X_train = np.concatenate((X_t, X_val))
y_train = np.concatenate((y_t, y_val))

x_train = pd.DataFrame(X_train, columns=column_names)
y_train = pd.DataFrame(y_train, columns=['VO2max'])

In [10]:
numpy_y_train = y_train.to_numpy(dtype=float, copy=True)

numpy_y_train = np.ravel(numpy_y_train)

model = SVR(kernel='rbf').fit(X_train, numpy_y_train)

y_test = y_test.reset_index(drop=True)

y_hat = model.predict(X_test)

y_hat_data = pd.DataFrame(y_hat, columns = ['predicted_VO2max'])

support_vector_regression = pd.concat([y_hat_data, y_test], axis=1)

mean_absolute = mean_absolute_error(y_test, y_hat)

print('Support Vector Regression\n'
      '-------------------------')

print(f'The mean absolute error is: {mean_absolute}')

r2 = r2_score(y_test, y_hat)

print(f'The R2 score for the model is: {r2}')

support_vector_regression

Support Vector Regression
-------------------------
The mean absolute error is: 0.7291409357603712
The R2 score for the model is: 0.2869034585536301




Unnamed: 0,predicted_VO2max,VO2max
0,-0.244564,0.264255
1,0.423571,-0.739055
2,0.255626,-0.941075
3,-0.241683,0.768456
4,0.285095,2.196178
...,...,...
132,0.466917,1.411865
133,0.056243,-2.068314
134,-0.323249,-0.043019
135,-0.447261,1.019708


In [11]:
column_names = x_var.columns

X_t, X_remain, y_t, y_remain = train_test_split(x_var, y_var, test_size=0.3)
X_test, X_val, y_test, y_val = train_test_split(X_remain, y_remain, test_size=0.5)

X_train = np.concatenate((X_t, X_val))
y_train = np.concatenate((y_t, y_val))

x_train = pd.DataFrame(X_train, columns=column_names)
y_train = pd.DataFrame(y_train, columns=['VO2max'])

In [12]:
numpy_y_train = y_train.to_numpy(dtype=float, copy=True)

numpy_y_train = np.ravel(numpy_y_train)

model = GradientBoostingRegressor().fit(X_train, numpy_y_train)

y_test = y_test.reset_index(drop=True)

y_hat_data = pd.DataFrame(y_hat, columns = ['predicted_VO2max'])

gradient_regressor_results = pd.concat([y_hat_data, y_test], axis=1)

mean_absolute = mean_absolute_error(y_test, y_hat)

print('Gradient Boosting Regressor\n'
      '---------------------------')

print(f'The mean absolute error is: {mean_absolute}')

r2 = r2_score(y_test, y_hat)

print(f'The R2 score for the model is: {r2}')

gradient_regressor_results

Gradient Boosting Regressor
---------------------------
The mean absolute error is: 0.9296589488225304
The R2 score for the model is: -0.36817561368124796


Unnamed: 0,predicted_VO2max,VO2max
0,-0.244564,-0.739055
1,0.423571,0.235395
2,0.255626,1.990763
3,-0.241683,0.235395
4,0.285095,-1.891758
...,...,...
132,0.466917,1.804022
133,0.056243,0.235395
134,-0.323249,1.557863
135,-0.447261,-1.156677
