In [1]:
import pandas as pd

cycling = pd.read_csv('./datasets/bike_rides.csv', index_col=0, parse_dates=True)
cycling.index.name = ''
target_name = 'power'
data, target = cycling.drop(columns=target_name), cycling[target_name]
data

Unnamed: 0,heart-rate,cadence,speed,acceleration,slope
,,,,,
2020-08-18 14:43:19,102.0,64.0,4.325,0.0880,-0.033870
2020-08-18 14:43:20,103.0,64.0,4.336,0.0842,-0.033571
2020-08-18 14:43:21,105.0,66.0,4.409,0.0234,-0.033223
2020-08-18 14:43:22,106.0,66.0,4.445,0.0016,-0.032908
2020-08-18 14:43:23,106.0,67.0,4.441,0.1144,0.000000
...,...,...,...,...,...
2020-09-13 14:55:57,130.0,0.0,1.054,0.0234,0.000000
2020-09-13 14:55:58,130.0,0.0,0.829,0.0258,0.000000
2020-09-13 14:55:59,129.0,0.0,0.616,-0.1686,0.000000


In [9]:
import numpy as np

In [36]:
new_data = {
    'speed^3': data['speed']**3,
    'speed': data['speed'],
    'sin(alpha)*speed' : np.sin(np.arctan(data['slope']))*data['speed'],
    'acc*speed': data['acceleration'].clip(lower=0)*data['speed']
}
new_data = pd.DataFrame(new_data)

In [42]:
new_data['sin(alpha)*speed'].mean()

-0.002709709612566326

In [106]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=4) 
scaler = StandardScaler()
model = RidgeCV()

scaler.fit(new_data)
scaled_new_data = scaler.transform(new_data)

In [107]:
from sklearn.model_selection import ShuffleSplit, cross_validate

cv = ShuffleSplit(n_splits=4)
scores_linear = cross_validate(
    model, scaled_new_data, y=target,
    cv=cv, scoring='neg_mean_absolute_error',
    return_estimator=True,
    return_train_score=True,
)

In [108]:
print(
    f"Mean absolute error on the test sets: "
    f"{-scores_linear['test_score'].mean():.3f}"
)

Mean absolute error on the test sets: 72.203


In [109]:
scores_linear['estimator'][1].fit(new_data, target)

In [110]:
scores_linear['estimator'][1].coef_

array([8.95641209e-03, 1.26178534e+01, 3.23619060e+02, 1.03469648e+01])

In [111]:
from sklearn.ensemble import HistGradientBoostingRegressor

model = HistGradientBoostingRegressor(max_iter=1_000, early_stopping=True)

cv = ShuffleSplit(n_splits=4)
scores_hist = cross_validate(
    model, data, y=target,
    cv=cv, scoring='neg_mean_absolute_error',
    return_estimator=True,
    return_train_score=True,
)

In [112]:
print(
    f"Mean absolute error on the test sets: "
    f"{-scores_hist['test_score'].mean():.3f}"
)

Mean absolute error on the test sets: 43.662


In [114]:
scores_linear

{'fit_time': array([0.03616619, 0.0538435 , 0.08363962, 0.05134916]),
 'score_time': array([0.00469804, 0.00166821, 0.00267482, 0.00453138]),
 'estimator': [RidgeCV(), RidgeCV(), RidgeCV(), RidgeCV()],
 'test_score': array([-71.32824763, -73.33578215, -72.31008067, -71.83808168]),
 'train_score': array([-72.5927553 , -72.38662295, -72.45024204, -72.542963  ])}

In [113]:
scores_hist

{'fit_time': array([0.59966969, 0.43975139, 0.52923584, 0.49590015]),
 'score_time': array([0.01310587, 0.00982738, 0.01554537, 0.01061773]),
 'estimator': [HistGradientBoostingRegressor(early_stopping=True, max_iter=1000),
  HistGradientBoostingRegressor(early_stopping=True, max_iter=1000),
  HistGradientBoostingRegressor(early_stopping=True, max_iter=1000),
  HistGradientBoostingRegressor(early_stopping=True, max_iter=1000)],
 'test_score': array([-42.97334336, -45.35593732, -43.91813654, -42.39997667]),
 'train_score': array([-40.14300543, -41.10829082, -39.50339165, -41.00758613])}