In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from tpot import TPOTRegressor
from tpot.config import regressor_config_dict 
from sklearn.model_selection import cross_val_score

from challenge.preprocessing import build_new_features


In [None]:
# Notebook-level switches, kept together so they can be flipped in one place.
# NOTE(review): presumably VERBOSE toggles progress logging and EXPORT toggles
# writing artifacts to disk -- confirm against the cells that consume them.
VERBOSE: bool = True
EXPORT: bool = False

In [None]:
# All folders are resolved relative to the directory the kernel was started in.
current_path = Path.cwd()
data_path = current_path.joinpath("data")
output_path = current_path.joinpath("outputs")

# Create the output folder (and any missing parents); an existing folder is fine.
output_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Both files are read with identical options: the "date" column is parsed as
# datetimes and used as the index, and rows are then ordered by that index.
csv_options = {"parse_dates": ["date"], "index_col": ["date"]}

data = pd.read_csv(data_path / "train.csv", **csv_options).sort_index()
test = pd.read_csv(data_path / "test.csv", **csv_options).sort_index()

In [None]:
# Apply the project's feature engineering to the training and test frames alike.
# NOTE(review): this rebinds `data` and `test`, so re-running the cell feeds
# already-transformed frames back into build_new_features; restart the kernel
# and run all cells if in doubt.
data, test = (build_new_features(frame) for frame in (data, test))

In [None]:
# Hold out 20% of the rows for validation.
# random_state pins the shuffle so the split (and therefore every score reported
# further down) is identical across kernel restarts.
# NOTE(review): `data` is indexed by date and sorted; a shuffled split mixes time
# periods between train and validation, which can make the validation score
# optimistic if neighbouring rows are correlated. Consider a chronological split
# (shuffle=False) -- confirm with the challenge's evaluation protocol.
train, validation = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

In [None]:
# Separate the "wp1" target column from the predictors for both splits.
X_train, y_train = train.drop(columns="wp1"), train["wp1"]
X_val, y_val = validation.drop(columns="wp1"), validation["wp1"]

In [None]:
# Register LightGBM in TPOT's regressor search space: the key is the estimator's
# import path, the value maps each hyperparameter to the candidate values TPOT
# may sample from.
# NOTE: this mutates the dict imported from tpot.config, so the entry is visible
# to anything else in this kernel that uses regressor_config_dict.
regressor_config_dict["lightgbm.LGBMRegressor"] = {
    "boosting_type": ["gbdt", "dart"],
    "min_child_samples": [1, 5, 7, 10, 15, 20, 35, 50, 100, 200, 500, 1000],
    "num_leaves": [2, 4, 7, 10, 15, 20, 25, 30, 35, 40, 50, 65, 80, 100, 125, 150, 200, 250, 500],
    "colsample_bytree": [0.7, 0.9, 1.0],
    "subsample": [0.7, 0.9, 1.0],
    # LightGBM ignores `subsample` unless bagging is enabled with a non-zero
    # `subsample_freq` (default 0). Without this entry the three `subsample`
    # candidates above are indistinguishable and only waste search budget.
    "subsample_freq": [1],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [5, 20, 35, 50, 75, 100, 150, 200, 350, 500, 750, 1000, 1500, 2000],
}

In [None]:
# Wrap mean absolute error as a scorer. greater_is_better=False makes the scorer
# return the negated error, so "higher is better" still holds for the optimiser.
mae_scorer = make_scorer(
    score_func=mean_absolute_error,
    greater_is_better=False,
)

In [None]:
# Configure the TPOT search over the (extended) regressor config.
# generations/population_size are deliberately tiny here, so this is a smoke run
# rather than a thorough search.
pipeline_optimizer = TPOTRegressor(
    generations=1,
    population_size=5,
    scoring=mae_scorer,
    config_dict=regressor_config_dict,
    # Honour the notebook-level VERBOSE switch (2 = progress bar + extra info,
    # 0 = silent); with VERBOSE = True this matches the previous hard-coded 2.
    verbosity=2 if VERBOSE else 0,
    n_jobs=-1,
    # The evolutionary search is stochastic; fix the seed so the selected
    # pipeline and its scores can be reproduced after a kernel restart.
    random_state=42,
)

In [None]:
# Columns fed to the optimiser; every other column of X_train / X_val is left out.
selected_features = [
    "ws",
    "sin_wd",
    "hour",
    "day",
    "month",
]

In [None]:
# Run the pipeline search on the training split, restricted to the selected columns.
pipeline_optimizer.fit(X_train.loc[:, selected_features], y_train)

In [None]:
# Score on the training split. The optimiser was configured with mae_scorer
# (greater_is_better=False), so the value shown is the negated MAE.
pipeline_optimizer.score(X_train.loc[:, selected_features], y_train)

In [None]:
# Score on the held-out validation split; as with the training score, the value
# is the negated MAE produced by mae_scorer.
pipeline_optimizer.score(X_val.loc[:, selected_features], y_val)

In [None]:
# Write the best pipeline found as a standalone Python script.
# The export is gated on the notebook's EXPORT switch so that a plain
# "Run All" does not overwrite a previously exported pipeline, and the file goes
# into the outputs folder created earlier instead of the working directory.
if EXPORT:
    pipeline_optimizer.export(str(output_path / "tpot_exported_pipeline.py"))