In [61]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

In [62]:
# Read the raw table (the CSV has no header row) and attach the 50 column
# names, in file order, in a single chained expression.
df = pd.read_csv("../../data/frames_errors.csv", header=None).set_axis(
    [
        "block_id", "frame_idx",
        "E_mu_Z", "E_mu_phys_est", "E_mu_X",
        "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z",
        "N_mu_X", "M_mu_XX", "M_mu_XZ", "M_mu_X", "N_mu_Z", "M_mu_ZZ", "M_mu_Z",
        "N_nu1_X", "M_nu1_XX", "M_nu1_XZ", "M_nu1_X", "N_nu1_Z", "M_nu1_ZZ", "M_nu1_Z",
        "N_nu2_X", "M_nu2_XX", "M_nu2_XZ", "M_nu2_X", "N_nu2_Z", "M_nu2_ZZ", "M_nu2_Z",
        "nTot", "bayesImVoltage", "opticalPower",
        "polarizerVoltages[0]", "polarizerVoltages[1]",
        "polarizerVoltages[2]", "polarizerVoltages[3]",
        "temp_1", "biasVoltage_1", "temp_2", "biasVoltage_2",
        "synErr", "N_EC_rounds", "maintenance_flag", "estimator_name",
        "f_EC", "E_mu_Z_est", "R", "s", "p",
    ],
    axis="columns",
)

In [63]:
# Column groups; their concatenation (in this order) is the set of columns
# kept in `df` for the rest of the notebook.

# Row identifiers.
id_features = ["block_id", "frame_idx"]

# E_*, N_* and M_* columns.
phys_gt_features = [
    "E_mu_Z", "E_mu_X",
    "E_nu1_X", "E_nu2_X", "E_nu1_Z", "E_nu2_Z",
    "N_mu_X", "M_mu_XX", "M_mu_XZ", "M_mu_X", "N_mu_Z", "M_mu_ZZ", "M_mu_Z",
    "N_nu1_X", "M_nu1_XX", "M_nu1_XZ", "M_nu1_X", "N_nu1_Z", "M_nu1_ZZ", "M_nu1_Z",
    "N_nu2_X", "M_nu2_XX", "M_nu2_XZ", "M_nu2_X", "N_nu2_Z", "M_nu2_ZZ", "M_nu2_Z",
]

# Voltage, optical power and temperature readings.
phys_features = [
    "bayesImVoltage", "opticalPower",
    "polarizerVoltages[0]", "polarizerVoltages[1]",
    "polarizerVoltages[2]", "polarizerVoltages[3]",
    "temp_1", "biasVoltage_1", "temp_2", "biasVoltage_2",
]

# Columns used as prediction targets further down.
# "E_mu_phys_est" is intentionally not selected.
est_features = ["E_mu_Z_est", "R", "s", "p"]

# "f_EC" is intentionally not selected.
proxy_features = ["N_EC_rounds"]

df = df[
    [
        *id_features,
        *phys_gt_features,
        *phys_features,
        *est_features,
        *proxy_features,
    ]
]

In [64]:
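# Disabled: "f_EC" is not among the columns kept by the selection above,
# so this filter cannot run against the current `df`.
# df = df.dropna(subset=["f_EC"])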

In [65]:
# NaN count per column, narrowed to the columns that have at least one gap.
missing_counts = df.isnull().sum()
columns_with_missing = missing_counts.loc[missing_counts.gt(0)]
print("Столбцы с пропусками и их количество:", columns_with_missing, sep="\n")

# Distinct frame indices per block, then how many blocks share each count.
timestamp_counts = df.groupby("block_id")["frame_idx"].nunique()
print(
    "Количество фреймов / Количество рядов",
    timestamp_counts.value_counts(),
    sep="\n",
)
print("Shape:", df.shape)

Столбцы с пропусками и их количество:
Series([], dtype: int64)
Количество фреймов / Количество рядов
frame_idx
399    569
400    251
398      2
390      1
Name: count, dtype: int64
Shape: (328617, 44)


In [66]:
def create_features(df, features, lag, window):
    """Add per-block lag and rolling-window features for the given columns.

    For every column in ``features`` the following columns are created:

    * ``{feature}_lag_{k}`` for each ``k`` in ``lag``: the value ``k`` rows
      earlier inside the same ``block_id``.
    * ``{feature}_roll_mean_{w}`` and ``{feature}_roll_std_{w}`` for each ``w``
      in ``window``: mean / standard deviation over the ``w`` rows that precede
      the current row inside the same ``block_id`` (the series is shifted by
      one first, so the current row is never part of its own window).

    Rows are taken in their existing order within each block. Every step is
    grouped by ``block_id``, so blocks never contribute to each other's
    features, whether or not their rows are stored contiguously.

    Gap handling:

    * Derived columns are back-filled, then forward-filled, *within* each
      block. A block shorter than a lag/window therefore keeps NaN in that
      column instead of borrowing a value from a neighbouring block.
    * Columns that were already present in ``df`` get a frame-wide
      ``bfill().ffill()``.

    NOTE(review): back-filling gives the first rows of a block values taken
    from the current/later rows of that block (e.g. ``lag_1`` of a block's
    first row equals the row's own value). If any column in ``features`` is
    also a prediction target this is a small look-ahead leak; confirm that it
    is acceptable.

    Args:
        df: DataFrame with a ``block_id`` column and every column named in
            ``features``. It is not modified.
        features: Column names to derive features from.
        lag: Re-iterable collection of shift distances.
        window: Re-iterable collection of rolling-window sizes.

    Returns:
        tuple: ``(frame, created_features)`` where ``frame`` is a new
        DataFrame (same rows and index as ``df``) with the derived columns
        appended, and ``created_features`` lists the derived column names in
        creation order.
    """
    caller_index = df.index
    # Work on a positional index so group-wise results can be realigned by
    # label even when the caller's index contains duplicates.
    frame = df.reset_index(drop=True)
    block_ids = frame["block_id"]
    by_block = frame.groupby("block_id", sort=False)

    # Collect derived columns and attach them in one concat at the end;
    # inserting hundreds of columns one by one fragments the frame.
    derived = {}
    created_features = []

    for feature in features:
        column_by_block = by_block[feature]

        for step in lag:
            name = f"{feature}_lag_{step}"
            derived[name] = column_by_block.shift(step)
            created_features.append(name)

        # Shift by one inside each block, then roll inside each block as well.
        previous = column_by_block.shift(1)
        previous_by_block = previous.groupby(block_ids, sort=False)
        for size in window:
            rolling = previous_by_block.rolling(window=size)

            mean_name = f"{feature}_roll_mean_{size}"
            # The grouped result is indexed by (block_id, row position);
            # dropping the first level leaves the row position for alignment.
            derived[mean_name] = rolling.mean().droplevel(0)
            created_features.append(mean_name)

            std_name = f"{feature}_roll_std_{size}"
            derived[std_name] = rolling.std().droplevel(0)
            created_features.append(std_name)

    # Pre-existing columns keep the frame-wide fill. A column whose name
    # collides with a derived one is replaced by the derived version.
    result = frame.drop(columns=list(derived), errors="ignore").bfill().ffill()

    if derived:
        extra = pd.DataFrame(derived, index=frame.index)
        # Fill gaps block by block so values never cross a block boundary.
        extra = extra.groupby(block_ids, sort=False).bfill()
        extra = extra.groupby(block_ids, sort=False).ffill()
        result = pd.concat([result, extra], axis=1)

    result.index = caller_index
    return result, created_features

In [67]:
# Lag and rolling features are derived from all three column groups below.
features_to_create = [*phys_gt_features, *phys_features, *est_features]

df, created_features = create_features(
    df,
    features_to_create,
    lag=[1, 2, 3, 5, 10],
    window=[3, 5, 10],
)
print("Созданные признаки:", created_features)

Созданные признаки: ['E_mu_Z_lag_1', 'E_mu_Z_lag_2', 'E_mu_Z_lag_3', 'E_mu_Z_lag_5', 'E_mu_Z_lag_10', 'E_mu_Z_roll_mean_3', 'E_mu_Z_roll_std_3', 'E_mu_Z_roll_mean_5', 'E_mu_Z_roll_std_5', 'E_mu_Z_roll_mean_10', 'E_mu_Z_roll_std_10', 'E_mu_X_lag_1', 'E_mu_X_lag_2', 'E_mu_X_lag_3', 'E_mu_X_lag_5', 'E_mu_X_lag_10', 'E_mu_X_roll_mean_3', 'E_mu_X_roll_std_3', 'E_mu_X_roll_mean_5', 'E_mu_X_roll_std_5', 'E_mu_X_roll_mean_10', 'E_mu_X_roll_std_10', 'E_nu1_X_lag_1', 'E_nu1_X_lag_2', 'E_nu1_X_lag_3', 'E_nu1_X_lag_5', 'E_nu1_X_lag_10', 'E_nu1_X_roll_mean_3', 'E_nu1_X_roll_std_3', 'E_nu1_X_roll_mean_5', 'E_nu1_X_roll_std_5', 'E_nu1_X_roll_mean_10', 'E_nu1_X_roll_std_10', 'E_nu2_X_lag_1', 'E_nu2_X_lag_2', 'E_nu2_X_lag_3', 'E_nu2_X_lag_5', 'E_nu2_X_lag_10', 'E_nu2_X_roll_mean_3', 'E_nu2_X_roll_std_3', 'E_nu2_X_roll_mean_5', 'E_nu2_X_roll_std_5', 'E_nu2_X_roll_mean_10', 'E_nu2_X_roll_std_10', 'E_nu1_Z_lag_1', 'E_nu1_Z_lag_2', 'E_nu1_Z_lag_3', 'E_nu1_Z_lag_5', 'E_nu1_Z_lag_10', 'E_nu1_Z_roll_mean_3', 

In [68]:
# Held-out (validation) slice: every row from frame 99 of block 1489460492
# through frame 101 of block 1840064900, inclusive, by index label.
is_first_test_row = (df["block_id"] == 1489460492) & (df["frame_idx"] == 99)
is_last_test_row = (df["block_id"] == 1840064900) & (df["frame_idx"] == 101)
start_idx = df.index[is_first_test_row][0]
end_idx = df.index[is_last_test_row][0]

test_df = df.loc[start_idx:end_idx].copy()
assert len(test_df) == 2000, "Test (Val) set size is not 2000 rows"

# Training rows: only blocks with no row at all inside the held-out slice.
all_block_ids = df["block_id"].unique()
test_block_ids = test_df["block_id"].values
train_blocks = [bid for bid in all_block_ids if bid not in test_block_ids]
train_df = df[df["block_id"].isin(train_blocks)]

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")


Train: 326221
Test: 2000


In [69]:
# Train only on rows with N_EC_rounds of at most 1; test_df is not filtered.
train_df = train_df.loc[train_df["N_EC_rounds"].le(1)]

# CatBoost E_mu_Z

In [70]:
# Inputs for the E_mu_Z_est regressor: derived lag/rolling columns plus the
# current-row physical readings.
features_in = [*created_features, *phys_features]

X_train, y_train = train_df[features_in], train_df["E_mu_Z_est"]
X_test, y_test = test_df[features_in], test_df["E_mu_Z_est"]

In [None]:
# RMSE regressor for E_mu_Z_est; progress is logged every 100 iterations.
# NOTE(review): the held-out slice is passed as eval_set and is also the data
# predicted on afterwards — confirm that using it for both is intended.
model_E_mu_Z = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.05,
    random_state=42,
    verbose=100,
)

model_E_mu_Z.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.0125367	test: 0.0033755	best: 0.0033755 (0)	total: 35.9ms	remaining: 1m 11s
100:	learn: 0.0005656	test: 0.0002376	best: 0.0002376 (100)	total: 2.35s	remaining: 44.3s
200:	learn: 0.0004044	test: 0.0001976	best: 0.0001976 (200)	total: 4.9s	remaining: 43.9s
300:	learn: 0.0003287	test: 0.0001660	best: 0.0001660 (300)	total: 7.2s	remaining: 40.6s
400:	learn: 0.0002865	test: 0.0001463	best: 0.0001463 (400)	total: 9.34s	remaining: 37.3s
500:	learn: 0.0002515	test: 0.0001358	best: 0.0001357 (498)	total: 11.7s	remaining: 35.1s
600:	learn: 0.0002270	test: 0.0001250	best: 0.0001250 (600)	total: 14.4s	remaining: 33.5s
700:	learn: 0.0002071	test: 0.0001184	best: 0.0001184 (700)	total: 17.1s	remaining: 31.7s
800:	learn: 0.0001918	test: 0.0001135	best: 0.0001135 (799)	total: 19.6s	remaining: 29.4s
900:	learn: 0.0001792	test: 0.0001100	best: 0.0001100 (900)	total: 22.1s	remaining: 27s
1000:	learn: 0.0001684	test: 0.0001073	best: 0.0001073 (1000)	total: 24.6s	remaining: 24.5s
1100:	learn: 0

In [None]:
# Predict while X_test still holds the E_mu_Z_est feature matrix; the cells
# for the R and s models rebind X_test with different column sets.
E_mu_Z_pred = model_E_mu_Z.predict(X_test)

# CatBoost R

In [None]:
# Inputs for the R classifier: the E_mu_Z_est inputs plus E_mu_Z_est itself.
# NOTE(review): for the held-out rows this is the E_mu_Z_est column stored in
# test_df, not E_mu_Z_pred from the regressor — confirm this is intended.
features_in = [*created_features, *phys_features, "E_mu_Z_est"]

X_train, y_train = train_df[features_in], train_df["R"]
X_test, y_test = test_df[features_in], test_df["R"]

In [None]:
# Map the distinct R values to consecutive class indices for the classifier.
# The encoder is fitted on the raw R column rather than on y_train itself, so
# the cell stays idempotent: re-running it can never fit the encoder on
# already-encoded integers (which would make inverse_transform return class
# indices instead of R values).
le = LabelEncoder()
y_train = le.fit_transform(train_df["R"])
# transform() raises if the held-out slice has an R value absent from training.
y_test = le.transform(test_df["R"])
print(dict(zip(le.classes_, le.transform(le.classes_))))

{0.5: 0, 0.55: 1, 0.6: 2, 0.65: 3, 0.7: 4, 0.75: 5, 0.8: 6, 0.85: 7}


In [None]:
# Multi-class classifier over the encoded R labels; accuracy on the eval set
# is logged every 100 iterations.
model_R = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="Accuracy",
    iterations=500,
    learning_rate=0.05,
    random_state=42,
    verbose=100,
)

model_R.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.9199096	test: 0.9005000	best: 0.9005000 (0)	total: 352ms	remaining: 2m 55s
100:	learn: 0.9851952	test: 0.9820000	best: 0.9820000 (100)	total: 28.9s	remaining: 1m 54s
200:	learn: 0.9909151	test: 0.9865000	best: 0.9870000 (196)	total: 52.5s	remaining: 1m 18s
300:	learn: 0.9930728	test: 0.9890000	best: 0.9890000 (243)	total: 1m 13s	remaining: 48.5s
400:	learn: 0.9944534	test: 0.9905000	best: 0.9905000 (390)	total: 1m 36s	remaining: 23.9s
499:	learn: 0.9953440	test: 0.9910000	best: 0.9910000 (426)	total: 1m 57s	remaining: 0us

bestTest = 0.991
bestIteration = 426

Shrink model to first 427 iterations.


<catboost.core.CatBoostClassifier at 0x77e5a7a9e000>

In [None]:
# Flatten the predicted class indices to 1-D, then map them back to the
# original R values with the fitted encoder.
R_pred_labels = np.ravel(model_R.predict(X_test))
R_pred = le.inverse_transform(R_pred_labels)

# CatBoost s p

In [None]:
# Inputs for the s regressor: physical readings, derived columns, and the
# E_mu_Z_est / R columns.
# NOTE(review): for the held-out rows E_mu_Z_est and R come from test_df, not
# from E_mu_Z_pred / R_pred — confirm this is intended.
features_in = [*phys_features, *created_features, "E_mu_Z_est", "R"]

X_train, y_train = train_df[features_in], train_df["s"]
X_test, y_test = test_df[features_in], test_df["s"]

In [None]:
# RMSE regressor fitted on the "s" target; progress is logged every
# 100 iterations.
model_s_p = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.04,
    random_state=42,
    verbose=100,
)

model_s_p.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 567.3858332	test: 595.8490326	best: 595.8490326 (0)	total: 39.7ms	remaining: 1m 19s
100:	learn: 60.5236832	test: 26.7697550	best: 26.7697550 (100)	total: 3.26s	remaining: 1m 1s
200:	learn: 34.7342689	test: 14.9846589	best: 14.9846589 (200)	total: 6.08s	remaining: 54.4s
300:	learn: 26.1523457	test: 12.7061675	best: 12.7060989 (299)	total: 9.2s	remaining: 51.9s
400:	learn: 22.4791775	test: 10.5920983	best: 10.5901045 (397)	total: 12.6s	remaining: 50.1s
500:	learn: 20.2894504	test: 9.7277804	best: 9.7277804 (500)	total: 15.5s	remaining: 46.4s
600:	learn: 18.5538885	test: 8.7610229	best: 8.7610229 (600)	total: 18.8s	remaining: 43.7s
700:	learn: 17.0724616	test: 8.1422911	best: 8.1417661 (699)	total: 21.9s	remaining: 40.6s
800:	learn: 15.9663219	test: 7.6497517	best: 7.6495527 (799)	total: 24.9s	remaining: 37.3s
900:	learn: 15.1702432	test: 7.2328631	best: 7.2328631 (900)	total: 28s	remaining: 34.2s
1000:	learn: 14.2482643	test: 6.9420449	best: 6.9420449 (1000)	total: 30.9s	remain

<catboost.core.CatBoostRegressor at 0x77e5a4dd19a0>

In [None]:
# Raw regression output for s, then rounded to the nearest integer.
s_pred_float = model_s_p.predict(X_test)
s_pred_int = np.rint(s_pred_float).astype(int)

In [None]:
# p is not modelled; it is derived from the rounded s prediction.
# NOTE(review): 4800 is unexplained here — presumably s + p is fixed at 4800
# for every row; confirm against the data.
p_pred_int = 4800 - s_pred_int

In [None]:
# Assemble the four prediction columns and write them without a header row
# or an index column.
submission = pd.DataFrame(
    {
        "E_mu_Z": E_mu_Z_pred,
        "R": R_pred,
        "s": s_pred_int,
        "p": p_pred_int,
    }
)
submission.to_csv("submission.csv", header=False, index=False)