In [None]:
# Uncomment to install the notebook's dependencies into the active kernel's environment:
# %pip install -U numpy scikit-learn pandas xgboost lightgbm category_encoders matplotlib seaborn cloudpickle

In [1]:
import pandas as pd

# Read the training and test sets from CSV files in the working directory
# (training file first, then the test file).
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [2]:
import pandas as pd

# The data only carries `day_of_year` and `hour`, so a reference year is assumed
# purely to build comparable timestamps; the choice does not affect the ordering.
year = 2023

# Build one timestamp per row arithmetically: Jan 1 of the reference year
# plus (day_of_year - 1) days plus `hour` hours. This is fully vectorised (no
# row-wise `apply`), and because the year is never reset after the hour offset is
# added, a timestamp that spills past Dec 31 stays at the end of the ordering
# instead of wrapping around to the start.
# Missing values become NaT and are sorted last. Out-of-range `day_of_year`
# values are ordered numerically rather than being coerced to NaT.
year_start = pd.Timestamp(year=year, month=1, day=1)
row_timestamps = (
    year_start
    + pd.to_timedelta(train_df['day_of_year'] - 1, unit='D')
    + pd.to_timedelta(train_df['hour'], unit='h')
)

# Sort chronologically via a temporary `datetime` column that is dropped again.
# A stable sort keeps rows that share a timestamp in their original file order,
# so the row order (and any order-dependent CV split) is reproducible.
train_df = (
    train_df
    .assign(datetime=row_timestamps)
    .sort_values(by='datetime', kind='stable')
    .drop(columns='datetime')
)

In [3]:
# Target: the pollution reading to predict.
y_train = train_df["pollution_value"]

# Predictors: every training column except the row identifier and the target.
X_train = train_df.drop(columns=["id", "pollution_value"])

# The test frame is used exactly as loaded (no columns are removed here).
# NOTE(review): if test.csv also carries an `id` column, X_test will not have the
# same columns as X_train — confirm before transforming/predicting on it.
X_test = test_df

In [None]:
from tabularaml.generate.features import FeatureGenerator
from tabularaml.eval.scorers import PREDEFINED_REG_SCORERS, Scorer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from xgboost import XGBRegressor

def rmse_exp(y_true, y_pred):
    """Score predictions as ``exp(-RMSE / 100)``.

    An RMSE of zero gives a score of 1, and the score decreases towards 0 as
    the RMSE grows, so a larger score means a better fit.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        The exponentially transformed RMSE.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    return np.exp(-rmse / 100)

# Time-ordered cross-validation with 5 splits: each split trains on earlier rows
# and validates on later ones, so it relies on the rows already being in
# chronological order.
tscv = TimeSeriesSplit(n_splits=5)

# Wrap the custom metric for tabularaml. Higher values are better, and the
# metric is computed from predictions directly (`from_probs=False`).
rmse_exp_scorer = Scorer(
    name="rmse_exp",
    scorer=rmse_exp,
    greater_is_better=True,
    extra_params={},
    from_probs=False,
)

# Feature generator for the regression task, scored with the custom metric
# under the time-series CV scheme, for 10 generations.
# NOTE(review): `max_gen_new_feats_pct=1000` presumably caps new features as a
# percentage of the original feature count — confirm against the tabularaml docs.
generator = FeatureGenerator(
    task="regression",
    scorer=rmse_exp_scorer,
    max_gen_new_feats_pct=1000,
    cv=tscv,
    n_generations=10,
)

In [5]:
import os

# Create the output directory up front: if it were missing, the save below could
# fail only after the (long-running) feature search has already finished.
os.makedirs("model", exist_ok=True)

# Run the feature search on the training data and keep whatever `generate` returns.
results = generator.generate(X_train, y_train)

# Persist the generator so later cells can reload it without repeating the search.
generator.save("model/feature_generator.pkl")

Starting feature generation - Task: regression, Device: cuda
Dataset: 7649 samples, 6 features, Limit: 1000 new features max
Gen 0: Training with original features. Mean Train rmse_exp = 0.79358, Mean Val rmse_exp = 0.59367


Generations:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating features:   0%|          | 0/150 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Reload the persisted feature generator from disk.
# NOTE(review): the role of the second argument (`X_test`) to `load` is not
# visible from this notebook — confirm against the tabularaml API.
# NOTE(review): the file is a `.pkl`, so loading presumably unpickles it; only
# load files produced by a trusted run.
feature_generator_path = "model/feature_generator.pkl"
loaded_fg = FeatureGenerator.load(feature_generator_path, X_test)

# Apply the reloaded generator to the training features and show the result.
X_train_transformed = loaded_fg.transform(X_train)
X_train_transformed

In [None]:
from tabularaml.inspect.importance import FeatureImportanceAnalyzer

# Fit the importance analyzer on the original (untransformed) training features,
# using the same time-series CV splitter as the feature search.
# NOTE(review): this uses `X_train`, not `X_train_transformed` — confirm that
# analysing the raw features is what is intended here.
analyzer = FeatureImportanceAnalyzer(cv=tscv)
analyzer.fit(X_train, y_train)

# Render the dashboard. The trailing semicolon suppresses the cell's bare object
# repr (the recorded output was a `FeatureImportanceAnalyzer at 0x...` line).
analyzer.plot_feature_importance_dashboard();

<tabularaml.inspect.importance.FeatureImportanceAnalyzer at 0x152f85c70d0>