In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


In [3]:
def add_advanced_features(df: pd.DataFrame, input_len: int):
    """Append trend, ratio and mean-difference features for each window.

    The window of a row is the columns ``f0 .. f{input_len-1}``. Three
    columns are written to ``df`` itself (the frame is modified in place)
    and the same frame is returned:

    * ``trend`` - least-squares slope of the window values regressed on
      their positions ``0 .. input_len-1``.
    * ``last_over_mean`` - last window value divided by the window mean;
      ``1e-8`` is added to the denominator to avoid division by zero.
    * ``diff_mean`` - mean of the consecutive differences inside the window.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding at least the columns ``f0 .. f{input_len-1}``.
    input_len : int
        Number of window columns.

    Returns
    -------
    pd.DataFrame
        ``df`` with the three columns added or overwritten.

    Notes
    -----
    The slope is computed in closed form for all rows at once instead of
    fitting one regression model per row. Missing values in a window
    propagate to a NaN ``trend`` for that row.
    """
    feature_cols = [f"f{i}" for i in range(input_len)]
    feats = df[feature_cols]
    window = feats.to_numpy(dtype=float)

    # Closed-form simple-regression slope:
    #   sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)
    # evaluated for every row with a single matrix-vector product.
    centered_idx = np.arange(input_len, dtype=float)
    centered_idx -= centered_idx.mean()
    denom = float(centered_idx @ centered_idx)
    if denom > 0.0:
        centered_vals = window - window.mean(axis=1, keepdims=True)
        df["trend"] = (centered_vals @ centered_idx) / denom
    else:
        # A single-point window has no slope; report it as 0.0.
        df["trend"] = np.zeros(len(df))

    df["last_over_mean"] = df[f"f{input_len-1}"] / (feats.mean(axis=1) + 1e-8)
    df["diff_mean"] = feats.diff(axis=1).mean(axis=1)

    return df

In [4]:
def add_monster_features(df, input_len):
    """Append a broad set of per-window summary statistics.

    The window of a row is the columns ``f0 .. f{input_len-1}``. The
    following columns are written to ``df`` itself (in place) and the same
    frame is returned: ``mean``, ``std``, ``min``, ``max``,
    ``last_minus_first``, ``sum``, ``median``, ``range`` (max - min),
    ``cv`` (std / (mean + 1e-8)) and ``trend`` (least-squares slope of the
    window values against their positions ``0 .. input_len-1``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding at least the columns ``f0 .. f{input_len-1}``.
    input_len : int
        Number of window columns.

    Returns
    -------
    pd.DataFrame
        ``df`` with the columns above added or overwritten.

    Notes
    -----
    The slope is computed in closed form for all rows at once instead of
    fitting one regression model per row. Missing values in a window
    propagate to a NaN ``trend`` for that row.
    """
    fcols = [f"f{i}" for i in range(input_len)]
    feats = df[fcols]

    df["mean"] = feats.mean(axis=1)
    df["std"] = feats.std(axis=1)
    df["min"] = feats.min(axis=1)
    df["max"] = feats.max(axis=1)
    df["last_minus_first"] = df[f"f{input_len-1}"] - df["f0"]
    df["sum"] = feats.sum(axis=1)
    df["median"] = feats.median(axis=1)
    df["range"] = df["max"] - df["min"]
    # 1e-8 keeps the ratio finite when the window mean is exactly zero.
    df["cv"] = df["std"] / (df["mean"] + 1e-8)

    # Closed-form simple-regression slope:
    #   sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)
    # evaluated for every row with a single matrix-vector product.
    window = feats.to_numpy(dtype=float)
    centered_idx = np.arange(input_len, dtype=float)
    centered_idx -= centered_idx.mean()
    denom = float(centered_idx @ centered_idx)
    if denom > 0.0:
        centered_vals = window - window.mean(axis=1, keepdims=True)
        df["trend"] = (centered_vals @ centered_idx) / denom
    else:
        # A single-point window has no slope; report it as 0.0.
        df["trend"] = np.zeros(len(df))

    return df

In [5]:
# Load the three splits; the first CSV column becomes the index.
# "2025" is removed from every split when present, and the training split
# additionally loses "2024" (errors='ignore' tolerates a missing column).
train_df = pd.read_csv("../data/train_data_new.csv", index_col=0)
train_df = train_df.drop(columns=["2025", "2024"], errors='ignore')

val_df = pd.read_csv("../data/val_data_new.csv", index_col=0)
val_df = val_df.drop(columns=["2025"], errors='ignore')

test_df = pd.read_csv("../data/test_data_new.csv", index_col=0)
test_df = test_df.drop(columns=["2025"], errors='ignore')

In [12]:
# Show the loaded test split (in the output below: indexed by topic, one column per year).
test_df

Unnamed: 0_level_0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
variational inequalities,1075.0,429.0,667.0,776.0,888.0,1486.0,1880.0,1929.0,2752.0,2980.0,2946.0,3282.0,3909.0,4738.0,4964.0,4776.0,5410.0,6861.0
gesture recognition,132.0,42.0,84.0,120.0,124.0,207.0,314.0,420.0,600.0,852.0,891.0,1030.0,1185.0,1351.0,1368.0,1366.0,1509.0,2138.0
interval models,1050.0,324.0,524.0,614.0,707.0,1269.0,1844.0,1353.0,1842.0,1960.0,1772.0,1851.0,2069.0,2479.0,2557.0,2504.0,2745.0,3617.0
delta debugging,821.0,212.0,302.0,339.0,396.0,696.0,1008.0,858.0,1090.0,1263.0,1433.0,1831.0,2259.0,2745.0,3191.0,3005.0,3771.0,5579.0
railway control,9.0,5.0,11.0,9.0,7.0,18.0,19.0,35.0,47.0,56.0,64.0,78.0,83.0,89.0,123.0,122.0,150.0,229.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
on-device signal processing,643.0,265.0,427.0,474.0,540.0,874.0,1246.0,1397.0,2014.0,2673.0,2395.0,2555.0,2790.0,3095.0,3387.0,3337.0,3780.0,5403.0
xslt programming and evaluation,1480.0,382.0,666.0,780.0,936.0,1298.0,1695.0,1816.0,2182.0,2340.0,2455.0,2925.0,3516.0,4305.0,4892.0,5010.0,6844.0,12620.0
mobility models,656.0,235.0,440.0,483.0,552.0,987.0,1278.0,1158.0,1507.0,1877.0,1654.0,1723.0,1712.0,2045.0,2005.0,1877.0,2097.0,2927.0
chaos,883.0,283.0,460.0,515.0,521.0,877.0,1173.0,1122.0,1481.0,1777.0,1717.0,1862.0,2197.0,2549.0,2718.0,2589.0,2931.0,3993.0


In [6]:
# Sliding-window configuration: each sample uses `input_len` consecutive
# values as features and the following `output_len` values as targets.
input_len = 5
output_len = 1
# Seed handed to XGBRegressor(random_state=...) when training.
random_seed = 42

def make_sliding_samples(df, input_len, output_len):
    """Turn every row of ``df`` into sliding-window supervised samples.

    Each row is treated as one ordered series (columns taken left to
    right). A window of ``input_len + output_len`` consecutive values is
    moved along the row one position at a time; the first ``input_len``
    values become the features and the remaining ``output_len`` values
    become the targets.

    Parameters
    ----------
    df : pd.DataFrame
        One series per row; all values must be convertible to ``float``.
    input_len : int
        Number of feature values per sample.
    output_len : int
        Number of target values per sample.

    Returns
    -------
    pd.DataFrame
        Columns ``f0 .. f{input_len-1}`` followed by ``y1 .. y{output_len}``,
        one row per window. Samples are ordered by source row, then by
        window start. When no window fits, an empty frame with those
        columns is returned.

    Notes
    -----
    As a side effect the column labels of ``df`` are converted to strings
    in place; this is kept for callers that rely on it.
    """
    df.columns = df.columns.astype(str)

    feature_cols = [f"f{i}" for i in range(input_len)]
    target_cols = [f"y{i+1}" for i in range(output_len)]
    window = input_len + output_len

    # Rows are read positionally: label-based `df.loc[idx]` returns a
    # 2-D frame when index labels repeat, which would break the windowing.
    values = df.to_numpy(dtype=float)
    n_windows = values.shape[1] - window + 1  # <= 0 means no window fits

    samples = [
        row[start : start + window]
        for row in values
        for start in range(n_windows)
    ]

    # Give the empty case an explicit 2-D shape so the frame still has the
    # expected columns instead of failing on a shape mismatch.
    if samples:
        data = np.asarray(samples, dtype=float)
    else:
        data = np.empty((0, window), dtype=float)

    return pd.DataFrame(data, columns=feature_cols + target_cols)

In [7]:
# Window every split with the same feature / target lengths.
train_data, val_data, test_data = (
    make_sliding_samples(split, input_len, output_len)
    for split in (train_df, val_df, test_df)
)

In [8]:
# Show the windowed training samples (feature columns f*, target column y1 in the output below).
train_data

Unnamed: 0,f0,f1,f2,f3,f4,y1
0,1826.0,501.0,854.0,968.0,1132.0,1840.0
1,501.0,854.0,968.0,1132.0,1840.0,2362.0
2,854.0,968.0,1132.0,1840.0,2362.0,2482.0
3,968.0,1132.0,1840.0,2362.0,2482.0,3150.0
4,1132.0,1840.0,2362.0,2482.0,3150.0,3668.0
...,...,...,...,...,...,...
94315,4114.0,5930.0,6783.0,7094.0,8316.0,9776.0
94316,5930.0,6783.0,7094.0,8316.0,9776.0,11866.0
94317,6783.0,7094.0,8316.0,9776.0,11866.0,12672.0
94318,7094.0,8316.0,9776.0,11866.0,12672.0,13037.0


In [9]:
def make_test_2024_samples(df, input_len, output_len, target_year="2024"):
    """Build one evaluation sample per row, anchored at a target-year column.

    For every row the ``input_len`` columns immediately before
    ``target_year`` become the features and the ``output_len`` columns
    starting at ``target_year`` become the targets.

    Parameters
    ----------
    df : pd.DataFrame
        One series per row, columns ordered chronologically; values must be
        convertible to ``float``.
    input_len : int
        Number of feature values taken before the target column.
    output_len : int
        Number of target values taken from the target column onwards.
    target_year : str, default "2024"
        Label of the first target column.

    Returns
    -------
    pd.DataFrame
        Columns ``f0 .. f{input_len-1}`` followed by ``y1 .. y{output_len}``,
        one row per row of ``df`` (same order).

    Raises
    ------
    KeyError
        If ``target_year`` is not a column of ``df``.
    ValueError
        If the ``target_year`` label is duplicated, or if there are fewer
        than ``input_len`` columns before it or fewer than ``output_len``
        columns from it onwards.

    Notes
    -----
    As a side effect the column labels of ``df`` are converted to strings
    in place; this is kept for callers that rely on it.
    """
    df.columns = df.columns.astype(str)

    target_pos = df.columns.get_loc(str(target_year))
    if not isinstance(target_pos, (int, np.integer)):
        # get_loc returns a slice or mask when the label is not unique.
        raise ValueError(f"column {target_year!r} is not unique")

    start = target_pos - input_len
    stop = target_pos + output_len
    # A negative start would silently wrap around and a stop past the last
    # column would silently truncate the targets, so reject both up front.
    if start < 0 or stop > df.shape[1]:
        raise ValueError(
            f"cannot take {input_len} input and {output_len} output columns "
            f"around {target_year!r}: column position {target_pos} of {df.shape[1]}"
        )

    feature_cols = [f"f{i}" for i in range(input_len)]
    target_cols = [f"y{i+1}" for i in range(output_len)]

    # Positional slicing handles every row at once and is unaffected by
    # duplicate index labels. Features and targets are contiguous columns.
    values = df.to_numpy(dtype=float)
    return pd.DataFrame(values[:, start:stop], columns=feature_cols + target_cols)

# Build the held-out samples from the test split: for each row, the window
# ending just before the "2024" column is the input and "2024" onward is the target.
test_2024_data = make_test_2024_samples(test_df, input_len, output_len)
test_2024_data

Unnamed: 0,f0,f1,f2,f3,f4,y1
0,3909.0,4738.0,4964.0,4776.0,5410.0,6861.0
1,1185.0,1351.0,1368.0,1366.0,1509.0,2138.0
2,2069.0,2479.0,2557.0,2504.0,2745.0,3617.0
3,2259.0,2745.0,3191.0,3005.0,3771.0,5579.0
4,83.0,89.0,123.0,122.0,150.0,229.0
...,...,...,...,...,...,...
982,2790.0,3095.0,3387.0,3337.0,3780.0,5403.0
983,3516.0,4305.0,4892.0,5010.0,6844.0,12620.0
984,1712.0,2045.0,2005.0,1877.0,2097.0,2927.0
985,2197.0,2549.0,2718.0,2589.0,2931.0,3993.0


In [10]:
def add_stat_features(df: pd.DataFrame, input_len: int):
    """Attach basic window statistics to ``df`` and return it.

    Using the columns ``f0 .. f{input_len-1}`` as the window, the columns
    ``mean``, ``std``, ``min``, ``max`` and ``last_minus_first`` (last
    window value minus the first) are written to ``df`` in place.
    """
    window = df[[f"f{pos}" for pos in range(input_len)]]

    # Row-wise reductions, assigned in a fixed order so new columns are
    # appended as mean, std, min, max.
    for stat_name in ("mean", "std", "min", "max"):
        df[stat_name] = getattr(window, stat_name)(axis=1)

    last_col = f"f{input_len-1}"
    df["last_minus_first"] = df[last_col] - df["f0"]
    return df

# Add the basic window statistics to every sample frame.
train_data, val_data, test_data, test_2024_data = (
    add_stat_features(samples, input_len)
    for samples in (train_data, val_data, test_data, test_2024_data)
)

In [11]:
# Add the trend / ratio / mean-difference features to every sample frame.
train_data, val_data, test_data, test_2024_data = (
    add_advanced_features(samples, input_len)
    for samples in (train_data, val_data, test_data, test_2024_data)
)

In [12]:
# Add the extended summary statistics to every sample frame.
train_data, val_data, test_data, test_2024_data = (
    add_monster_features(samples, input_len)
    for samples in (train_data, val_data, test_data, test_2024_data)
)

In [13]:
# Separate the model inputs from the target column(s) for every sample frame.
columns_to_drop = [f"y{step}" for step in range(1, output_len + 1)]

X_train, y_train = train_data.drop(columns=columns_to_drop), train_data[columns_to_drop]
X_val, y_val = val_data.drop(columns=columns_to_drop), val_data[columns_to_drop]
X_test, y_test = test_data.drop(columns=columns_to_drop), test_data[columns_to_drop]
X_test_2024, y_test_2024 = (
    test_2024_data.drop(columns=columns_to_drop),
    test_2024_data[columns_to_drop],
)

In [14]:
def train_one_target(y_train, y_val, y_test, label,
                     X_fit=None, X_early_stop=None, X_holdout=None):
    """Train an XGBoost regressor for one target and score it on a held-out set.

    The model is fitted on the training features, monitored on the
    validation features through ``eval_set`` (with
    ``early_stopping_rounds=50``), then used to predict the held-out
    features. The RMSE against ``y_test`` is printed and the predictions
    are written next to the ground truth in ``predictions_{label}.csv``
    (relative to the current working directory).

    Parameters
    ----------
    y_train, y_val, y_test :
        Target values matching, row for row, the training, validation and
        held-out feature matrices respectively.
    label : str
        Tag used in the printed message and in the output file name.
    X_fit, X_early_stop, X_holdout : optional
        Feature matrices for training, early stopping and held-out
        prediction. When omitted, the notebook-level ``X_train``, ``X_val``
        and ``X_test_2024`` are used, which is the original behaviour.

    Returns
    -------
    XGBRegressor
        The fitted model.

    Raises
    ------
    ValueError
        If the held-out features and ``y_test`` differ in length. This is
        checked before training so a mismatch does not cost a full fit.
    """
    # Fall back to the notebook globals so existing four-argument calls
    # behave exactly as before.
    X_fit = X_train if X_fit is None else X_fit
    X_early_stop = X_val if X_early_stop is None else X_early_stop
    X_holdout = X_test_2024 if X_holdout is None else X_holdout

    if len(X_holdout) != len(y_test):
        raise ValueError(
            f"held-out features have {len(X_holdout)} rows but y_test has {len(y_test)}"
        )

    model = XGBRegressor(
        n_estimators=3000,
        learning_rate=0.01,
        max_depth=10,
        subsample=1.0,
        colsample_bytree=1.0,
        reg_alpha=0.0,
        reg_lambda=0.0,
        min_child_weight=1,
        objective="reg:squarederror",
        random_state=random_seed,
        n_jobs=-1,
        early_stopping_rounds=50
    )
    model.fit(
        X_fit, y_train,
        eval_set=[(X_early_stop, y_val)],
        verbose=False
    )
    preds = model.predict(X_holdout)
    mse = mean_squared_error(y_test, preds)
    print(f"[{label}] Test RMSE: {mse ** 0.5:.4f}")

    # Save predictions alongside ground truth
    results = pd.DataFrame({'Ground Truth': y_test, 'Predictions': preds})
    results.to_csv(f'predictions_{label}.csv', index=False)

    return model

# Train and evaluate the one-step-ahead model on the 2024 held-out samples.
model_y1 = train_one_target(y_train["y1"], y_val["y1"], y_test_2024["y1"], "T+1")
# The calls below need target columns y2 / y3, which the sample builders only
# create when output_len is at least 2 / 3 (it is currently 1).
# model_y2 = train_one_target(y_train["y2"], y_val["y2"], y_test_2024["y2"], "T+2")
# model_y3 = train_one_target(y_train["y3"], y_val["y3"], y_test_2024["y3"], "T+3")

[T+1] Test RMSE: 2975.6817
