In [None]:
! pip install scikit-learn==1.6.1

In [None]:
%load_ext cudf.pandas

from cuml.svm import LinearSVR as cuLinearSVR
from cuml.linear_model import Ridge as cuRidge

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import TargetEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import make_scorer, root_mean_squared_log_error, root_mean_squared_error
from tqdm import tqdm
from itertools import combinations
import matplotlib.pyplot as plt

In [None]:
# Read the competition training data and lower-case every column name.
users = pd.read_csv("/kaggle/input/playground-series-s5e5/train.csv").rename(columns=str.lower)

# Sanity checks: ids never repeat and, surprisingly, no value anywhere is missing.
assert users['id'].is_unique
assert not users.isnull().any().any()

In [None]:
# Target transform chosen with the competition's objective function in mind.
# np.log1p(x) evaluates log(1 + x) directly and stays accurate for small x.
users = users.assign(calories_log1p=lambda df_: np.log1p(df_['calories']))

In [None]:
# One-hot encode the categorical columns. The encoder is set to emit a pandas
# DataFrame so the generated indicator columns can be appended to `users`.
COLUMNS_TO_ONEHOT = ['sex']
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='warn').set_output(transform='pandas')
users_onehot = onehot_encoder.fit_transform(users[COLUMNS_TO_ONEHOT])

# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not produce duplicate column names.
users = pd.concat(
    [users.drop(columns=users_onehot.columns, errors='ignore'), users_onehot],
    axis=1,
)

In [None]:
# Columns treated as categorical.
COLUMNS_CATEGORICAL = ['sex']

# Columns treated as numeric. The order is kept as-is because the names of
# derived features are built from it.
# NOTE(review): 'sex_male' / 'sex_female' are presumably the one-hot indicator
# columns generated from 'sex' -- confirm against the encoder output.
COLUMNS_NUMERIC = [
    'age', 'height', 'weight',
    'duration', 'heart_rate', 'body_temp',
    'sex_male', 'sex_female',
]

In [None]:
# Interaction features: for every pair and every triple of numeric columns,
# add a column holding their row-wise product. Each new column is named by
# joining the participating column names with '_'.
degrees_interaction = [2, 3]
FEATURES_INTERACTED = []

for degree in degrees_interaction:
    column_groups = list(combinations(COLUMNS_NUMERIC, degree))

    for column_group in tqdm(column_groups):
        interaction_name = '_'.join(column_group)
        users[interaction_name] = users[list(column_group)].prod(axis=1)
        FEATURES_INTERACTED.append(interaction_name)

In [None]:
# Identify numeric columns whose variance is exactly zero (they carry no
# information) and remove them from the frame.
variances = users.select_dtypes(include='number').var()

columns_zero_variance = variances[variances == 0].index.tolist()

users = users.drop(columns=columns_zero_variance)

# Remove the dropped columns from the interaction list while keeping the
# original order. A set difference would return the names in an order that
# varies between kernel sessions (string hashing is randomised per process),
# which makes the later candidate order, and any tie-breaking, unreproducible.
_dropped = set(columns_zero_variance)
FEATURES_INTERACTED = [ftr for ftr in FEATURES_INTERACTED if ftr not in _dropped]

In [None]:
# test main effects first
# -1.0 makes results decimals; 1 makes all zero ...
degrees_polynomial = [-1.0, 0.5, 2.0]
columns_polynomial = []
for x in COLUMNS_NUMERIC:
    for d in degrees_polynomial:
        series = users[x]**d
        # Keep the power only if it produced no infinities (e.g. a zero raised
        # to -1.0). The check is vectorised: wrapping the Series in the builtin
        # all() would walk every row in Python.
        if not np.isinf(series).any():
            column_name = f"{x}^{d}"
            users[column_name] = series
            columns_polynomial.append(column_name)

In [None]:
# Following the basis expansion via interactions, check:
#   - the size of the new feature set (shown by the shape below)
#   - invariant (uninformative) features? (not checked by this cell)
users.shape

In [None]:
# XY is a second name for the same DataFrame object as `users` (plain assignment, no copy).
XY = users

# Feature Selection

In [None]:
def create_transformers_argument(features_model, features_universe_target_encode):
    """Build the `transformers` list for a ColumnTransformer.

    Parameters
    ----------
    features_model
        Iterable of feature names used by the model.
    features_universe_target_encode
        Container of feature names that should be target encoded.

    Returns
    -------
    list
        Empty when none of `features_model` appears in
        `features_universe_target_encode`; otherwise a single
        `('target_encode', TargetEncoder, features)` triple listing the
        matching features in the order they occur in `features_model`.
    """
    to_encode = []
    for feature in features_model:
        if feature in features_universe_target_encode:
            to_encode.append(feature)

    # Nothing to encode: the caller gets an empty transformer list.
    if not to_encode:
        return []

    encoder = TargetEncoder(target_type='continuous', cv=3)
    return [('target_encode', encoder, to_encode)]

In [None]:
from joblib import Parallel, delayed
import warnings
import time

def fit_evaluate_marginal_model(
    feature_marginal,
    X_challenger,
    y,
    n_jobs):
    """Cross-validate a scaled ridge pipeline on the challenger feature set.

    Parameters
    ----------
    feature_marginal
        Label of the candidate feature. It plays no part in the fit and is
        handed back unchanged as the first element of the result.
    X_challenger
        DataFrame of features to fit on; all of its columns are used.
    y
        Target values, forwarded to `cross_val_score`.
    n_jobs
        Forwarded to `cross_val_score`.

    Returns
    -------
    tuple
        `(feature_marginal, rmse)`, where `rmse` is the mean over the 5 folds
        of the root mean squared error (the negated sklearn score with its
        sign flipped back).
    """
    # Columns of X_challenger that are listed in COLUMNS_CATEGORICAL get target
    # encoded; every other column passes through untouched.
    column_transformer = ColumnTransformer(
        create_transformers_argument(list(X_challenger.columns), COLUMNS_CATEGORICAL),
        remainder='passthrough',
        verbose_feature_names_out=False,
    )

    steps = [
        ("transform_features", column_transformer),
        ("standard_scale", StandardScaler()),
        # float32 input is expected by the NVIDIA RAPIDS GPU-enabled model
        ('to_float32', FunctionTransformer(lambda X: X.astype(np.float32))),
        # Model history: Support Vector Regression was tested but was too
        # time-consuming; KernelRidge was considered, but the documentation
        # suggests it is equivalent to SVR; sklearn Ridge with solver='saga'
        # was tried in an attempt to resolve divide-by-zero runtime errors;
        # cuLinearSVR was another candidate. The GPU ridge is what is used.
        ('model', cuRidge()),
    ]
    pipeline_e2e = Pipeline(steps)

    fold_scores = cross_val_score(
        pipeline_e2e,
        X_challenger,
        y,
        scoring='neg_root_mean_squared_error',
        cv=5,
        n_jobs=n_jobs,
    )

    return (feature_marginal, -1 * fold_scores.mean())

In [None]:
# it's strange to suppress a warning, but empirically and from research,
# haven't found the call-to-action from this warning.
# warnings.filterwarnings("ignore", message=".*A worker stopped while some jobs were given to the executor.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*Changing solver to 'svd' as 'eig' or 'cd' solvers*", category=UserWarning)

start = time.time()


features_marginal = columns_polynomial + COLUMNS_NUMERIC + FEATURES_INTERACTED
# using features universe, model search times practically infeasible
# features_marginal = [x for x in features_marginal if x not in features_low_importance]
features_champion = []
features_added_count = 0

# dummy, to kick off procedure
champion_score = 10_000
challenger_score = 1_000
champions_score_sequence = [champion_score]

# The target and the fold assignment are identical for every candidate, so
# compute them once instead of once per candidate feature. With a fixed
# random_state the shuffled split is the same on every call anyway.
y = XY['calories_log1p']
kf = KFold(n_splits=5, shuffle=True, random_state=777)
folds = list(kf.split(XY))

# Greedy forward selection.
# Each round scores champion + one candidate for every remaining candidate and
# picks the best such challenger.
# when challenger improves upon champion, continue extending challenger.
# when challenger loses to champion, challenger has become too complex.
# The loop also stops when no candidates remain (min() of an empty list would
# raise) and when the best challenger merely ties the champion (a tie selects
# nothing, so the same round would otherwise repeat forever).
while features_marginal:

    challengers_scores = []
    for x in features_marginal:

        X = XY[features_champion + [x]]

        scores_cv = []
        for indexes_train, indexes_test in folds:

            # KFold yields positional row indices, so select with .iloc;
            # .loc is label-based and only agrees for a default RangeIndex.
            X_train, X_test = X.iloc[indexes_train], X.iloc[indexes_test]
            y_train, y_test = y.iloc[indexes_train], y.iloc[indexes_test]

            # Standardise with statistics from the training fold only and
            # apply the same shift/scale to the held-out fold.
            features_mean = X_train.mean()
            features_std = X_train.std()
            X_train_trfm = (X_train - features_mean) / features_std

            model = cuRidge(alpha=0.1)
            model.fit(X_train_trfm, y_train)

            X_test_trfm = (X_test - features_mean) / features_std
            preds = model.predict(X_test_trfm)

            score = root_mean_squared_error(y_test, preds)
            scores_cv.append(score)

        score_summary = np.array(scores_cv).mean()

        challengers_scores.append( (x, score_summary) )

    # Best challenger of this round = lowest mean RMSE across folds.
    feature_marginal_challenger, challenger_score = min(challengers_scores, key=lambda pair: pair[1])

    print(f"Challenger score: {challenger_score}")
    print(f"Champion score: {champion_score}")
    elapsed = time.time() - start
    print("Cell run time:", time.strftime("%H:%M:%S", time.gmtime(elapsed)))

    # if challenger improves upon champion, then replace champion with challenger.
    challenger_improves = challenger_score < champion_score
    if challenger_improves:

        features_champion += [feature_marginal_challenger]
        champion_score = challenger_score
        features_marginal.remove(feature_marginal_challenger)
        print(f"{feature_marginal_challenger} selected in this step.")

        features_added_count += 1
        print(f"Model features count comes to {features_added_count}.")

    champions_score_sequence.append(champion_score)

    if not challenger_improves:
        break

In [None]:
# Display the features chosen by the selection loop, in the order they were added.
features_champion

In [None]:
# FEATURES_SELECTED is bound to the same list object as features_champion (no copy is made).
FEATURES_SELECTED = features_champion

In [None]:
# A bare `break` outside a loop is a SyntaxError, which aborts "Run All" at
# this cell. NOTE(review): that stop is presumably intentional (confirm);
# raise explicitly so the halt is self-describing and the cell is valid Python.
raise RuntimeError("Intentional stop: run the cells below manually to fit the pruned model.")

# Fit Pruned Model

In [None]:
# Cross-validate a random forest using only the selected features.
# random_state pins the forest's randomness (e.g. bootstrap sampling) so the
# scores are reproducible from one run to the next.
scores = cross_val_score(
    RandomForestRegressor(n_estimators=100, random_state=777),
    users[FEATURES_SELECTED],
    users['calories_log1p'],
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=2
)

# The scorer is the negated RMSE, so the values shown here are -RMSE.
scores.mean(), scores