In [1]:
from itertools import combinations
import re
import optuna
from sklearn.base import BaseEstimator, clone, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from data_manipulations import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_features = pl.read_parquet('train_features.parquet')
train_targets = pl.read_parquet('train_targets.parquet')
test_features = pl.read_parquet('test_features.parquet')
test_targets = pl.read_parquet('test_targets.parquet')

data = prepare_data_for_ml(
    train_features, train_targets, test_features, test_targets,
)

In [3]:
def restrict(frame, heads=(25, 20)):
    keys = frame.columns[:2]
    temp = join_many((
        frame.select(keys[0]).unique().sort(by=keys[0]).head(heads[0]),
        frame.select(keys[1]).unique().sort(by=keys[1]).head(heads[1]),
    ), how='cross')
    return frame.join(temp, on=keys)

train_features = restrict(train_features)
train_targets = restrict(train_targets)
test_features = restrict(test_features)
test_targets = restrict(test_targets)

data = prepare_data_for_ml(train_features, train_targets, test_features, test_targets)
data['train_features_rect'].shape, data['train_targets_rect'].shape

((25, 20), (25, 20))

# Model

In [4]:
class SparseSGD(BaseEstimator, RegressorMixin):
    def __init__(
        self,
        alpha=0.005,
        l1_ratio=0.2,
        min_features_to_select=5,
    ):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.model = SGDRegressor(
            penalty='elasticnet',  # 'l2'
            alpha=alpha,  # 0.001
            l1_ratio=l1_ratio,  # 0.15
            fit_intercept=True,  # True
            max_iter=2000,  # 1000
            random_state=0,  # None
            learning_rate='adaptive',  # 'invscaling'
            early_stopping=True,  # False
        )
        self.min_features_to_select = min_features_to_select
        self.models = None

    def fit(self, features, targets):
        self.models = []
        for ind in range(targets.shape[1]):
            rfecv = RFECV(
                estimator=clone(self.model),
                step=1,
                cv=3,
                scoring=scoring,
                min_features_to_select=self.min_features_to_select,
                n_jobs=-1,
                verbose=0,
            )
            rfecv.fit(StandardScaler().fit_transform(features), targets[:, ind])
            self.models.append(rfecv)
        return self

    def predict(self, features):
        scaled_features = StandardScaler().fit_transform(features)
        predictions = [model.predict(scaled_features) for model in self.models]
        return np.stack(predictions, axis=1)

def get_cv_score(kwargs):
    return cross_val_score(
        SparseSGD(**kwargs),
        data['train_features_rect'].to_numpy(),
        data['train_targets_rect'].to_numpy(),
        cv=5,
        scoring=scoring,
        n_jobs=-1,
    ).mean()

# Exploration

In [5]:
%%time

n_trials = 50
progress_bar = tqdm(total=n_trials)

param_grid = {
    'alpha': ['float', 0, 1, {}],
    'l1_ratio': ['float', 0, 1, {}],
    'min_features_to_select': ['int', 1, 50, {}],
}

def objective(trial):
    kwargs = {
        key: getattr(trial, f'suggest_{typ}')(key, *bounds, **options)
        for key, (typ, *bounds, options) in param_grid.items()
    }
    score = get_cv_score(kwargs)
    progress_bar.update(1)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

  0%|          | 0/50 [00:00<?, ?it/s][I 2025-04-21 19:28:54,854] A new study created in memory with name: no-name-deaed7c8-fda2-4e43-88f1-634f4fd8a19c
  2%|▏         | 1/50 [00:03<02:49,  3.45s/it][I 2025-04-21 19:28:58,308] Trial 0 finished with value: 0.5760246494712404 and parameters: {'alpha': 0.23268963900186135, 'l1_ratio': 0.4004585250382168, 'min_features_to_select': 29}. Best is trial 0 with value: 0.5760246494712404.
  4%|▍         | 2/50 [00:34<15:43, 19.65s/it][I 2025-04-21 19:29:29,289] Trial 1 finished with value: 0.5760246494712404 and parameters: {'alpha': 0.6359748371909683, 'l1_ratio': 0.0899593979513964, 'min_features_to_select': 3}. Best is trial 0 with value: 0.5760246494712404.
  6%|▌         | 3/50 [00:37<09:29, 12.13s/it][I 2025-04-21 19:29:32,465] Trial 2 finished with value: 0.5760246494712404 and parameters: {'alpha': 0.46645387712303943, 'l1_ratio': 0.6360331945075636, 'min_features_to_select': 45}. Best is trial 0 with value: 0.5760246494712404.
  8%|▊    

CPU times: user 1.43 s, sys: 487 ms, total: 1.91 s
Wall time: 5min 53s


In [6]:
prefix = r'^params_'
cv_results = pl.DataFrame(study.trials_dataframe())
cv_results = (
    cv_results
    .rename({
        col: re.sub(prefix, '', col)
        for col in cv_results.columns
    })
)

In [7]:
# for pair in combinations(param_grid.keys(), 2):
#     heatmap_part(cv_results, pair, 'value')

In [8]:
best = (
    cv_results
    .filter(pl.col('value') == pl.col('value').max())
    .select(*param_grid.keys(), 'value')
    .to_dicts()[0]
)
best

{'alpha': 0.003561251647984122,
 'l1_ratio': 0.6413736633402022,
 'min_features_to_select': 27,
 'value': 0.6616214158436657}

In [9]:
best = {
    'alpha': 0.1,
    'l1_ratio': 0.5,
    'min_features_to_select': 5,
}

# Stability, local grid search

In [10]:
param_grid = {
    'alpha': np.linspace(best['alpha'] * 0.8, best['alpha'] * 1.2, 5),
    'l1_ratio': np.linspace(best['l1_ratio'] * 0.8, best['l1_ratio'] * 1.2, 5),
    'min_features_to_select': np.arange(max(best['min_features_to_select'] - 2, 1), best['min_features_to_select'] + 2),
}
param_grid_local = pl.concat([
    join_many([
        pl.DataFrame(v_l, schema=[k_l]),
        pl.DataFrame(v_r, schema=[k_r]),
        pl.DataFrame(best).drop(k_l, k_r),
    ], how='cross').select(best.keys()).with_columns(group=pl.lit(ind))
    for ind, ((k_l, v_l), (k_r, v_r)) in enumerate(combinations(param_grid.items(), 2))
])
group_mapping = dict(enumerate(combinations(param_grid.keys(), 2)))

In [None]:
%%time

iterator = param_grid_local.drop('group').iter_rows(named=True)
scores = [get_cv_score(kwargs) for kwargs in tqdm(iterator, total=param_grid_local.height)]
cv_results = param_grid_local.with_columns(pl.Series(scores).alias('value'))


  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:27<29:17, 27.46s/it][A
  3%|▎         | 2/65 [00:53<28:13, 26.88s/it][A
  5%|▍         | 3/65 [01:21<27:57, 27.05s/it][A
  6%|▌         | 4/65 [01:47<27:16, 26.83s/it][A

In [None]:
for group, part in cv_results.partition_by('group', include_key=False, as_dict=True).items():
    heatmap_part(part, group_mapping[group[0]], 'value')

In [None]:
(
    cv_results
    .filter(pl.col('value') == pl.col('value').max())
    .select(*param_grid.keys(), 'value')
    .to_dicts()
    [0]
)

# Learning curve

In [None]:
model = SparseSGD(**best)

In [None]:
%%time

train_sizes, train_scores, test_scores = learning_curve(
    model,
    data['train_features_rect'].to_numpy(),
    data['train_targets_rect'].to_numpy(),
    cv=5,
    scoring=scoring,
    n_jobs=-1,
)
plot_learning_curve(train_sizes, train_scores, test_scores)

# Score on test

In [None]:
fitted_model, preds = fit_predict_sklearn_model(
    train_features, train_targets, test_features, test_targets,
    model,
)
join_truth_and_score(preds, data['test_days'], test_targets)