# Импорт библиотек

In [1]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# Подготовка данных

In [2]:
df = pd.read_csv("../../data/frames_errors.csv", header=None)
df.columns = [
    "block_id",
    "frame_idx",
    "E_mu_Z",
    "E_mu_phys_est",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
    "nTot",
    "bayesImVoltage",
    "opticalPower",
    "polarizerVoltages[0]",
    "polarizerVoltages[1]",
    "polarizerVoltages[2]",
    "polarizerVoltages[3]",
    "temp_1",
    "biasVoltage_1",
    "temp_2",
    "biasVoltage_2",
    "synErr",
    "N_EC_rounds",
    "maintenance_flag",
    "estimator_name",
    "f_EC",
    "E_mu_Z_est",
    "R",
    "s",
    "p",
]

In [75]:
missing_counts = df.isna().sum()
columns_with_missing = missing_counts[missing_counts > 0]
print("Столбцы с пропусками и их количество:")
print(columns_with_missing)
timestamp_counts = df.groupby("block_id")["frame_idx"].nunique()
print("Количество фреймов / Количество рядов")
print(timestamp_counts.value_counts())
print("Shape:", df.shape)

Столбцы с пропусками и их количество:
E_mu_phys_est    445
dtype: int64
Количество фреймов / Количество рядов
frame_idx
399    510
400    238
398     48
397     18
396      4
393      2
395      2
386      1
Name: count, dtype: int64
Shape: (328486, 50)


In [4]:
df = df.dropna(subset=["f_EC"])

In [5]:
# Test (Val)
start_idx = df[(df["block_id"] == 1489460492) & (df["frame_idx"] == 99)].index[0]
end_idx = df[(df["block_id"] == 1840064900) & (df["frame_idx"] == 101)].index[0]

test_df = df.loc[start_idx:end_idx].copy()
assert len(test_df) == 2000, "Test (Val) set size is not 2000 rows"

# Train
all_block_ids = df["block_id"].unique()
train_blocks = [bid for bid in all_block_ids if bid not in test_df["block_id"].values]
train_df = df[df["block_id"].isin(train_blocks)]

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")


Train: 326090
Test: 2000


In [6]:
phys_features = [
    "bayesImVoltage",
    "opticalPower",
    "polarizerVoltages[0]",
    "polarizerVoltages[1]",
    "polarizerVoltages[2]",
    "polarizerVoltages[3]",
    "temp_1",
    "biasVoltage_1",
    "temp_2",
    "biasVoltage_2",
]

features_to_lag = phys_features + [
    "E_mu_Z_est",
    "E_mu_Z",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
]

In [7]:
def create_lag_features(df, features_to_lag, lags):
    df = df.copy()
    for lag in range(1, lags + 1):
        for feature in features_to_lag:
            df[f"{feature}_lag{lag}"] = df.groupby("block_id")[feature].shift(lag)
        # df = df.dropna(subset=[f"{feature}_lag{lag}" for feature in features_to_lag])
    df = df.fillna(0)
    return df

In [8]:
train_df = create_lag_features(train_df, features_to_lag, lags=5)
test_df_lag = create_lag_features(test_df, features_to_lag, lags=5)

# Catboost E_mu_Z

In [47]:
import numpy as np
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [48]:
features_to_drop = [
    "E_mu_Z_est",
    "E_mu_Z",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
    "block_id",
    "frame_idx",
    "estimator_name",
    "maintenance_flag",
    "synErr",
    "N_EC_rounds",
    "f_EC",
    "R",
    "s",
    "p",
    "nTot",
    "E_mu_phys_est",
]

In [11]:
max_rounds = train_df["N_EC_rounds"].max()
min_rounds = train_df["N_EC_rounds"].min()
train_df["N_EC_rounds_norm"] = (train_df["N_EC_rounds"] - min_rounds) / (
    max_rounds - min_rounds + 1e-6
)
train_df["penalty_score"] = train_df["f_EC"] + train_df["N_EC_rounds_norm"]
train_df["optimality_weight"] = 1 / train_df["penalty_score"]

In [15]:
score_feat = ["penalty_score", "N_EC_rounds_norm", "optimality_weight"]

In [None]:
X_train = train_df.drop(columns=features_to_drop + score_feat)
y_train = train_df["E_mu_Z_est"]
X_test = test_df_lag.drop(columns=features_to_drop)
y_test = test_df_lag["E_mu_Z_est"]

In [50]:
model_E_mu_Z = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    loss_function="RMSE",
    verbose=100,
    random_state=42,
)

In [51]:
model_E_mu_Z.fit(
    X_train,
    y_train,
    sample_weight=train_df["optimality_weight"],
    eval_set=(X_test, y_test),
)

0:	learn: 0.0112461	test: 0.0024588	best: 0.0024588 (0)	total: 30.2ms	remaining: 30.2s
100:	learn: 0.0009706	test: 0.0001523	best: 0.0001523 (100)	total: 2.39s	remaining: 21.3s
200:	learn: 0.0008795	test: 0.0001359	best: 0.0001355 (193)	total: 4.66s	remaining: 18.5s
300:	learn: 0.0008040	test: 0.0001279	best: 0.0001279 (299)	total: 7.23s	remaining: 16.8s
400:	learn: 0.0007350	test: 0.0001206	best: 0.0001206 (400)	total: 9.88s	remaining: 14.8s
500:	learn: 0.0006851	test: 0.0001162	best: 0.0001162 (500)	total: 12.3s	remaining: 12.2s
600:	learn: 0.0006384	test: 0.0001201	best: 0.0001155 (523)	total: 14.8s	remaining: 9.81s
700:	learn: 0.0006067	test: 0.0001184	best: 0.0001155 (523)	total: 17.3s	remaining: 7.39s
800:	learn: 0.0005698	test: 0.0001210	best: 0.0001155 (523)	total: 19.9s	remaining: 4.95s
900:	learn: 0.0005363	test: 0.0001221	best: 0.0001155 (523)	total: 22.4s	remaining: 2.46s
999:	learn: 0.0005080	test: 0.0001304	best: 0.0001155 (523)	total: 24.9s	remaining: 0us

bestTest = 0.0

<catboost.core.CatBoostRegressor at 0x78062c3b5bb0>

In [21]:
E_mu_Z_pred = model_E_mu_Z.predict(X_test)

# CatBoost R

In [22]:
features_to_drop = [
    "E_mu_Z",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
    "block_id",
    "frame_idx",
    "estimator_name",
    "maintenance_flag",
    "synErr",
    "N_EC_rounds",
    "f_EC",
    "R",
    "s",
    "p",
    "nTot",
    "E_mu_phys_est",
]

In [23]:
X_train = train_df.drop(columns=features_to_drop + score_feat)
y_train = train_df["R"]
X_test = test_df_lag.drop(columns=features_to_drop)
y_test = test_df_lag["R"]

In [27]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
print(dict(zip(le.classes_, le.transform(le.classes_))))

{0.5: 0, 0.55: 1, 0.6: 2, 0.65: 3, 0.7: 4, 0.75: 5, 0.8: 6, 0.85: 7}


In [29]:
model_R = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    verbose=100,
    random_state=42,
)

model_R.fit(
    X_train,
    y_train,
    sample_weight=train_df["optimality_weight"],
    eval_set=(X_test, y_test),
)

0:	learn: 0.9949187	test: 1.0000000	best: 1.0000000 (0)	total: 248ms	remaining: 4m 8s
100:	learn: 0.9975448	test: 1.0000000	best: 1.0000000 (0)	total: 20.4s	remaining: 3m 1s
200:	learn: 0.9980505	test: 1.0000000	best: 1.0000000 (0)	total: 37.1s	remaining: 2m 27s
300:	learn: 0.9986070	test: 1.0000000	best: 1.0000000 (0)	total: 53.4s	remaining: 2m 3s
400:	learn: 0.9988640	test: 1.0000000	best: 1.0000000 (0)	total: 1m 9s	remaining: 1m 43s
500:	learn: 0.9989888	test: 1.0000000	best: 1.0000000 (0)	total: 1m 24s	remaining: 1m 24s
600:	learn: 0.9991756	test: 1.0000000	best: 1.0000000 (0)	total: 1m 40s	remaining: 1m 6s
700:	learn: 0.9993575	test: 1.0000000	best: 1.0000000 (0)	total: 1m 55s	remaining: 49.3s
800:	learn: 0.9994460	test: 1.0000000	best: 1.0000000 (0)	total: 2m 10s	remaining: 32.4s
900:	learn: 0.9995059	test: 1.0000000	best: 1.0000000 (0)	total: 2m 26s	remaining: 16.1s
999:	learn: 0.9995947	test: 1.0000000	best: 1.0000000 (0)	total: 2m 42s	remaining: 0us

bestTest = 1
bestIteration

<catboost.core.CatBoostClassifier at 0x78062c395bb0>

In [None]:
R_pred_labels = model_R.predict(X_test).flatten()

R_pred = le.inverse_transform(R_pred_labels)

CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:81: At position 10 should be feature with name E_mu_Z_est (found R).

# CatBoost s p

In [53]:
features_to_drop = [
    "E_mu_Z_est",
    "E_mu_Z",
    "E_mu_X",
    "E_nu1_X",
    "E_nu2_X",
    "E_nu1_Z",
    "E_nu2_Z",
    "N_mu_X",
    "M_mu_XX",
    "M_mu_XZ",
    "M_mu_X",
    "N_mu_Z",
    "M_mu_ZZ",
    "M_mu_Z",
    "N_nu1_X",
    "M_nu1_XX",
    "M_nu1_XZ",
    "M_nu1_X",
    "N_nu1_Z",
    "M_nu1_ZZ",
    "M_nu1_Z",
    "N_nu2_X",
    "M_nu2_XX",
    "M_nu2_XZ",
    "M_nu2_X",
    "N_nu2_Z",
    "M_nu2_ZZ",
    "M_nu2_Z",
    "block_id",
    "frame_idx",
    "estimator_name",
    "maintenance_flag",
    "synErr",
    "N_EC_rounds",
    "f_EC",
    "s",
    "p",
    "nTot",
    "E_mu_phys_est",
]

In [None]:
X_train = train_df.drop(columns=features_to_drop + score_feat)
y_train = train_df["s"]
X_test = test_df_lag.drop(columns=features_to_drop)
y_test = test_df_lag["s"]

In [52]:
model_s_p = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    loss_function="RMSE",
    verbose=100,
    random_state=42,
)

In [57]:
model_s_p.fit(
    X_train,
    y_train,
    sample_weight=train_df["optimality_weight"],
    eval_set=(X_test, y_test),
)

0:	learn: 542.1752133	test: 591.7669787	best: 591.7669787 (0)	total: 37.3ms	remaining: 37.2s
100:	learn: 61.4803885	test: 26.6004258	best: 26.6004258 (100)	total: 2.8s	remaining: 25s
200:	learn: 38.7142615	test: 16.0980702	best: 16.0980702 (200)	total: 5.07s	remaining: 20.1s
300:	learn: 33.0135414	test: 12.9552523	best: 12.9552523 (300)	total: 7.25s	remaining: 16.8s
400:	learn: 30.2461218	test: 11.4428745	best: 11.4428745 (400)	total: 9.52s	remaining: 14.2s
500:	learn: 28.3713122	test: 10.7909325	best: 10.7909325 (500)	total: 11.9s	remaining: 11.9s
600:	learn: 26.9542270	test: 10.1372622	best: 10.1372622 (600)	total: 14.5s	remaining: 9.62s
700:	learn: 25.8664709	test: 9.6497914	best: 9.6474599 (699)	total: 16.9s	remaining: 7.21s
800:	learn: 24.9790550	test: 9.4046563	best: 9.4030859 (797)	total: 19.1s	remaining: 4.75s
900:	learn: 24.2587953	test: 9.2533400	best: 9.2533400 (900)	total: 21.4s	remaining: 2.35s
999:	learn: 23.5204420	test: 9.1527274	best: 9.1342389 (997)	total: 23.6s	remai

<catboost.core.CatBoostRegressor at 0x780626f43560>

In [58]:
s_pred_float = model_s_p.predict(X_test)
s_pred_int = np.round(s_pred_float).astype(int)

In [60]:
p_pred_int = 4800 - s_pred_int

In [74]:
pd.DataFrame(data={"E_mu_Z": E_mu_Z_pred, "R": R_pred, "s": s_pred_int, "p": p_pred_int}).to_csv(
    "submission.csv", header=False, index=False
)